This report outlines the analysis of a subset of the Bottomly data set, consisting of 11 replicates of striatal tissue from DBA/2J mice, and 10 replicates of striatal tissue from C57BL/6J mice.
basedir <- "/home/charlotte/gene_vs_tx_quantification"
refdir <- "/home/Shared/data/annotation"
suppressPackageStartupMessages(library(Rsubread))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(iCOBRA))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(DESeq2, lib.loc = "/home/charlotte/R/x86_64-pc-linux-gnu-library/3.2"))
suppressPackageStartupMessages(library(DEXSeq))
The code below downloads the FASTQ files for the six samples from the SRA.
fastq_files <- c(paste0("ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/",
"fastq/SRA026/SRA026846/SRX0334",
setdiff(72:94, c(77, 87)), "/SRR0992",
23:43, ".fastq.bz2"))
for (fq in fastq_files) {
if (!file.exists(paste0(basedir, "/data/bottomly/fastq/", basename(fq)))) {
cmd <- paste0("wget -P ", basedir, "/data/bottomly/fastq ", fq)
message(cmd)
system(cmd)
} else {
message(paste0(fq, " is already downloaded."))
}
}
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033472/SRR099223.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033473/SRR099224.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033474/SRR099225.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033475/SRR099226.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033476/SRR099227.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033478/SRR099228.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033479/SRR099229.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033480/SRR099230.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033481/SRR099231.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033482/SRR099232.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033483/SRR099233.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033484/SRR099234.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033485/SRR099235.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033486/SRR099236.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033488/SRR099237.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033489/SRR099238.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033490/SRR099239.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033491/SRR099240.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033492/SRR099241.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033493/SRR099242.fastq.bz2 is already downloaded.
## ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA026/SRA026846/SRX033494/SRR099243.fastq.bz2 is already downloaded.
meta <- read.delim(paste0(basedir, "/data/bottomly/bottomly_phenodata.txt"),
header = TRUE, as.is = TRUE)
rownames(meta) <- meta$srr.id
meta
## sample.id srr.id num.tech.reps strain experiment.number lane.number
## SRR099230 SRX033480 SRR099230 1 C57BL/6J 6 1
## SRR099237 SRX033488 SRR099237 1 C57BL/6J 7 1
## SRR099231 SRX033481 SRR099231 1 C57BL/6J 6 2
## SRR099238 SRX033489 SRR099238 1 C57BL/6J 7 2
## SRR099232 SRX033482 SRR099232 1 C57BL/6J 6 3
## SRR099239 SRX033490 SRR099239 1 C57BL/6J 7 3
## SRR099233 SRX033483 SRR099233 1 C57BL/6J 6 5
## SRR099227 SRX033476 SRR099227 1 C57BL/6J 4 6
## SRR099228 SRX033478 SRR099228 1 C57BL/6J 4 7
## SRR099229 SRX033479 SRR099229 1 C57BL/6J 4 8
## SRR099223 SRX033472 SRR099223 1 DBA/2J 4 1
## SRR099224 SRX033473 SRR099224 1 DBA/2J 4 2
## SRR099225 SRX033474 SRR099225 1 DBA/2J 4 3
## SRR099226 SRX033475 SRR099226 1 DBA/2J 4 5
## SRR099240 SRX033491 SRR099240 1 DBA/2J 7 5
## SRR099234 SRX033484 SRR099234 1 DBA/2J 6 6
## SRR099241 SRX033492 SRR099241 1 DBA/2J 7 6
## SRR099235 SRX033485 SRR099235 1 DBA/2J 6 7
## SRR099242 SRX033493 SRR099242 1 DBA/2J 7 7
## SRR099236 SRX033486 SRR099236 1 DBA/2J 6 8
## SRR099243 SRX033494 SRR099243 1 DBA/2J 7 8
cdna_fasta <- paste0(refdir, "/Mouse/Ensembl_GRCm38.82/cDNA/Mus_musculus.GRCm38.cdna.all.fa.gz")
salmon_index <- paste0(basedir, "/annotation/Mouse_Ensembl_GRCm38.82/salmon_index/Mus_musculus.GRCm38.82.gtf.sidx")
genome_fasta <- paste0(refdir, "/Mouse/Ensembl_GRCm38.82/genome/Mus_musculus.GRCm38.dna.toplevel.fa")
gtf <- paste0(refdir, "/Mouse/Ensembl_GRCm38.82/GTF/Mus_musculus.GRCm38.82.gtf")
star_index_dir <- paste0(basedir, "/annotation/Mouse_Ensembl_GRCm38.82/STAR_125")
## Download reference files if needed
############################### GENOME FASTA ##############################
if (!file.exists(genome_fasta)) {
cmd <- paste0("wget -P ", refdir, "/Mouse/Ensembl_GRCm38.82/genome ",
"ftp://ftp.ensembl.org/pub/release-82/fasta/mus_musculus/",
"dna/Mus_musculus.GRCm38.dna.toplevel.fa.gz")
message(cmd)
system(cmd)
cmd <- paste0("gunzip ", genome_fasta, ".gz")
message(cmd)
system(cmd)
} else {
message(paste0("Reference genome is already downloaded."))
}
## Reference genome is already downloaded.
############################### cDNA FASTA ################################
if (!file.exists(cdna_fasta)) {
cmd <- paste0("wget -P ", refdir, "/Mouse/Ensembl_GRCm38.82/cDNA ",
"ftp://ftp.ensembl.org/pub/release-82/fasta/mus_musculus/",
"cdna/Mus_musculus.GRCm38.cdna.all.fa.gz")
message(cmd)
system(cmd)
cmd <- paste0("gunzip -c ", cdna_fasta, "> ", gsub("\\.gz$", "", cdna_fasta))
message(cmd)
system(cmd)
} else {
message(paste0("Reference cDNA fasta is already downloaded."))
}
## Reference cDNA fasta is already downloaded.
#################################### GTF ###################################
if (!file.exists(gtf)) {
cmd <- paste0("wget -P ", refdir, "/Mouse/Ensembl_GRCm38.82/GTF ",
"ftp://ftp.ensembl.org/pub/release-82/gtf/mus_musculus/",
"Mus_musculus.GRCm38.82.gtf.gz")
message(cmd)
system(cmd)
cmd <- paste0("gunzip ", gtf, ".gz")
message(cmd)
system(cmd)
} else {
message(paste0("Reference GTF file is already downloaded."))
}
## Reference GTF file is already downloaded.
## Build STAR index
cmd <- paste0("STAR --runMode genomeGenerate --runThreadN 10 ",
"--genomeDir ", star_index_dir, " --genomeFastaFiles ",
genome_fasta, " --sjdbGTFfile ", gtf, " --sjdbOverhang 125")
message(cmd)
## STAR --runMode genomeGenerate --runThreadN 10 --genomeDir /home/charlotte/gene_vs_tx_quantification/annotation/Mouse_Ensembl_GRCm38.82/STAR_125 --genomeFastaFiles /home/Shared/data/annotation/Mouse/Ensembl_GRCm38.82/genome/Mus_musculus.GRCm38.dna.toplevel.fa --sjdbGTFfile /home/Shared/data/annotation/Mouse/Ensembl_GRCm38.82/GTF/Mus_musculus.GRCm38.82.gtf --sjdbOverhang 125
if (!file.exists(paste0(star_index_dir, "/SA"))) {
system(cmd)
} else {
message(paste0("STAR index already exists."))
}
## STAR index already exists.
## Build Salmon index
cmd <- paste("salmon index -i", salmon_index, "-t", gsub("\\.gz$", "", cdna_fasta), "-p 5 --type quasi")
message(cmd)
## salmon index -i /home/charlotte/gene_vs_tx_quantification/annotation/Mouse_Ensembl_GRCm38.82/salmon_index/Mus_musculus.GRCm38.82.gtf.sidx -t /home/Shared/data/annotation/Mouse/Ensembl_GRCm38.82/cDNA/Mus_musculus.GRCm38.cdna.all.fa -p 5 --type quasi
if (!file.exists(paste0(salmon_index, "/hash.bin"))) {
system(cmd)
} else {
message("Salmon index already exists.")
}
## Salmon index already exists.
feature_lengths_file <- paste0(basedir, "/annotation/Mouse_Ensembl_GRCm38.82/feature_lengths.Rdata")
tx_gene_file <- paste0(basedir, "/annotation/Mouse_Ensembl_GRCm38.82/tx_gene_map.Rdata")
salmon_basedir <- paste0(basedir, "/quantifications/bottomly/salmon")
cv_salmon_file <- paste0(basedir, "/quantifications/bottomly/salmon/cv_salmon.Rdata")
## Calculate gene and transcript lengths, get gene-transcript mapping
if (!file.exists(feature_lengths_file)) {
calc_lengths_mapping(gtf = gtf, cdna_fasta = cdna_fasta,
feature_lengths_file = feature_lengths_file,
tx_gene_file = tx_gene_file)
} else {
message("feature lengths and tx-to-gene map already calculated.")
}
## feature lengths and tx-to-gene map already calculated.
load(feature_lengths_file)
load(tx_gene_file)
fqfiles <- list.files(paste0(basedir, "/data/bottomly/fastq"), full.names = TRUE)
for (fq in fqfiles) {
if (!file.exists(paste0(basedir, "/data/bottomly/bam/",
gsub("\\.fastq.bz2", "", basename(fq)), "/Aligned.out.bam"))) {
cmd <- paste0("STAR --genomeDir ", star_index_dir, " --readFilesIn ",
fq, " --runThreadN 10 ",
"--outFileNamePrefix ", basedir, "/data/bottomly/bam/",
gsub("\\.fastq.bz2", "", basename(fq)), "/",
" --outSAMtype BAM Unsorted --readFilesCommand bunzip2 -c")
cat(cmd, "\n")
system(cmd)
} else {
message(paste0("Sample ", basename(fq), " is already aligned."))
}
}
## Sample SRR099223.fastq.bz2 is already aligned.
## Sample SRR099224.fastq.bz2 is already aligned.
## Sample SRR099225.fastq.bz2 is already aligned.
## Sample SRR099226.fastq.bz2 is already aligned.
## Sample SRR099227.fastq.bz2 is already aligned.
## Sample SRR099228.fastq.bz2 is already aligned.
## Sample SRR099229.fastq.bz2 is already aligned.
## Sample SRR099230.fastq.bz2 is already aligned.
## Sample SRR099231.fastq.bz2 is already aligned.
## Sample SRR099232.fastq.bz2 is already aligned.
## Sample SRR099233.fastq.bz2 is already aligned.
## Sample SRR099234.fastq.bz2 is already aligned.
## Sample SRR099235.fastq.bz2 is already aligned.
## Sample SRR099236.fastq.bz2 is already aligned.
## Sample SRR099237.fastq.bz2 is already aligned.
## Sample SRR099238.fastq.bz2 is already aligned.
## Sample SRR099239.fastq.bz2 is already aligned.
## Sample SRR099240.fastq.bz2 is already aligned.
## Sample SRR099241.fastq.bz2 is already aligned.
## Sample SRR099242.fastq.bz2 is already aligned.
## Sample SRR099243.fastq.bz2 is already aligned.
bamdir <- list.files(paste0(basedir, "/data/bottomly/bam"), full.names = TRUE)
bamdir <- bamdir[file.info(bamdir)$isdir]
bamfiles <- paste0(bamdir, "/Aligned.out.bam")
names(bamfiles) <- basename(bamdir)
fc_bottomly_file <- paste0(basedir, "/quantifications/bottomly/featureCounts/fc_bottomly.Rdata")
if (!file.exists(fc_bottomly_file)) {
fc_bottomly <- Rsubread::featureCounts(
files = bamfiles,
annot.ext = gtf,
isGTFAnnotationFile = TRUE,
GTF.featureType = "exon",
GTF.attrType = "gene_id",
useMetaFeatures = TRUE,
isPairedEnd = FALSE, nthreads = 5,
strandSpecific = 0, countMultiMappingReads = FALSE
)
colnames(fc_bottomly$counts) <- sapply(colnames(fc_bottomly$counts), function(w) {
s <- strsplit(w, "\\.")[[1]]
s[grep("SRR", s)]
})
colnames(fc_bottomly$stat) <- sapply(colnames(fc_bottomly$stat), function(w) {
s <- strsplit(w, "\\.")[[1]]
s[grep("SRR", s)]
})
save(fc_bottomly, file = fc_bottomly_file)
} else {
message("Loading previously saved file with featureCounts quantifications.")
load(fc_bottomly_file)
}
## Loading previously saved file with featureCounts quantifications.
bamdir <- list.files(paste0(basedir, "/data/bottomly/bam"), full.names = TRUE)
bamdir <- bamdir[file.info(bamdir)$isdir]
bamfiles <- paste0(bamdir, "/Aligned.out.bam")
names(bamfiles) <- basename(bamdir)
fc_bottomly_file_fracmm <- paste0(basedir, "/quantifications/bottomly/featureCounts/fc_bottomly_fracMM.Rdata")
if (!file.exists(fc_bottomly_file_fracmm)) {
fc_bottomly_fracmm <- Rsubread::featureCounts(
files = bamfiles,
annot.ext = gtf,
isGTFAnnotationFile = TRUE,
GTF.featureType = "exon",
GTF.attrType = "gene_id",
useMetaFeatures = TRUE,
isPairedEnd = FALSE, nthreads = 5,
strandSpecific = 0, countMultiMappingReads = TRUE,
fraction = TRUE
)
colnames(fc_bottomly_fracmm$counts) <- sapply(colnames(fc_bottomly_fracmm$counts), function(w) {
s <- strsplit(w, "\\.")[[1]]
s[grep("SRR", s)]
})
colnames(fc_bottomly_fracmm$stat) <- sapply(colnames(fc_bottomly_fracmm$stat), function(w) {
s <- strsplit(w, "\\.")[[1]]
s[grep("SRR", s)]
})
save(fc_bottomly_fracmm, file = fc_bottomly_file_fracmm)
} else {
message("Loading previously saved file with featureCounts quantifications.")
load(fc_bottomly_file_fracmm)
}
## Loading previously saved file with featureCounts quantifications.
fqs <- list.files(paste0(basedir, "/data/bottomly/fastq"), full.names = TRUE)
names(fqs) <- gsub("\\.fastq.bz2", "", basename(fqs))
for (i in 1:length(fqs)) {
if (!file.exists(paste0(salmon_basedir, "/", names(fqs)[i], "/quant.sf"))) {
cmd <- sprintf("bash -c 'salmon quant -i %s -l U -r %s -p 5 -o %s --numBootstraps=30'",
salmon_index,
paste0("<(bzcat ", fqs[i], ")"),
paste0(salmon_basedir, "/", names(fqs)[i]))
cat(cmd, "\n")
system(cmd)
} else {
cat("Salmon results for", names(fqs)[i], "already exist.\n")
}
}
## Salmon results for SRR099223 already exist.
## Salmon results for SRR099224 already exist.
## Salmon results for SRR099225 already exist.
## Salmon results for SRR099226 already exist.
## Salmon results for SRR099227 already exist.
## Salmon results for SRR099228 already exist.
## Salmon results for SRR099229 already exist.
## Salmon results for SRR099230 already exist.
## Salmon results for SRR099231 already exist.
## Salmon results for SRR099232 already exist.
## Salmon results for SRR099233 already exist.
## Salmon results for SRR099234 already exist.
## Salmon results for SRR099235 already exist.
## Salmon results for SRR099236 already exist.
## Salmon results for SRR099237 already exist.
## Salmon results for SRR099238 already exist.
## Salmon results for SRR099239 already exist.
## Salmon results for SRR099240 already exist.
## Salmon results for SRR099241 already exist.
## Salmon results for SRR099242 already exist.
## Salmon results for SRR099243 already exist.
We use the tximport
package (https://github.com/mikelove/tximport) to generate count matrices and offset matrices (average transcript lengths) from the Salmon transcript-level estimates. We generate two different count matrices (simplesum and scaledTPM), and additionally create offsets to be used with the simplesum matrix.
salmon_files <- list.files(salmon_basedir, pattern = "SRR", full.names = TRUE)
salmon_files <- salmon_files[file.info(salmon_files)$isdir]
salmon_files <- paste0(salmon_files, "/quant.sf")
salmon_files <- salmon_files[file.exists(salmon_files)]
names(salmon_files) <- basename(gsub("/quant.sf", "", salmon_files))
txi_salmonsimplesum <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
txOut = FALSE, countsFromAbundance = "no",
gene2tx = gene2tx)
## reading in files
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## summarizing abundance
## summarizing counts
## summarizing length
txi_salmonscaledtpm <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
txOut = FALSE, countsFromAbundance = "scaledTPM",
gene2tx = gene2tx)
## reading in files
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## summarizing abundance
## summarizing counts
## summarizing length
txi_salmontx <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
txOut = TRUE, countsFromAbundance = "no", gene2tx = gene2tx)
## reading in files
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
salmon_quant <- list(geneCOUNT_sal_simplesum = txi_salmonsimplesum$counts,
geneCOUNT_sal_scaledTPM = txi_salmonscaledtpm$counts,
avetxlength = txi_salmonsimplesum$length,
geneTPM_sal = txi_salmonsimplesum$abundance,
txTPM_sal = txi_salmontx$abundance,
txCOUNT_sal = txi_salmontx$counts,
txi_salmonsimplesum = txi_salmonsimplesum,
txi_salmonscaledtpm = txi_salmonscaledtpm,
txi_salmontx = txi_salmontx)
## Transcript
salmon_dirs <- list.files(salmon_basedir, pattern = "SRR", full.names = TRUE)
salmon_dirs <- salmon_dirs[file.info(salmon_dirs)$isdir]
names(salmon_dirs) <- basename(salmon_dirs)
if (!file.exists(cv_salmon_file)) {
CV_tx_salmon <- as.data.frame(sapply(salmon_dirs, function(w) {
tmpdf <- as.data.frame(t(read.delim(paste0(w, "/quant_bootstraps.sf"),
skip = 10, header = TRUE, as.is = TRUE)))
tmpcv <- apply(tmpdf, 1, sd)/apply(tmpdf, 1, mean)
tmpcv[is.na(tmpcv)] <- 0
names(tmpcv) <- rownames(tmpdf)
tmpcv
}))
CV_tx_salmon$mean_cv <- apply(CV_tx_salmon[, names(salmon_dirs)], 1, mean)
CV_tx_salmon$min_cv <- apply(CV_tx_salmon[, names(salmon_dirs)], 1, min)
CV_tx_salmon$max_cv <- apply(CV_tx_salmon[, names(salmon_dirs)], 1, max)
CV_tx_salmon$median_cv <- apply(CV_tx_salmon[, names(salmon_dirs)], 1, median)
## Gene
CV_gene_salmon <- as.data.frame(sapply(salmon_dirs, function(w) {
tmpdf <- as.data.frame(t(read.delim(paste0(w, "/quant_bootstraps.sf"),
skip = 10, header = TRUE, as.is = TRUE)))
tmpdf$gene <- tx2gene$gene[match(rownames(tmpdf), tx2gene$transcript)]
tmpdf <- tmpdf %>% group_by(gene) %>% summarise_each(funs(sum)) %>% as.data.frame
rownames(tmpdf) <- tmpdf$gene
tmpdf$gene <- NULL
tmpcv <- apply(tmpdf, 1, sd)/apply(tmpdf, 1, mean)
tmpcv[is.na(tmpcv)] <- 0
names(tmpcv) <- rownames(tmpdf)
tmpcv
}))
CV_gene_salmon$mean_cv <- apply(CV_gene_salmon[, names(salmon_dirs)], 1, mean)
CV_gene_salmon$min_cv <- apply(CV_gene_salmon[, names(salmon_dirs)], 1, min)
CV_gene_salmon$max_cv <- apply(CV_gene_salmon[, names(salmon_dirs)], 1, max)
CV_gene_salmon$median_cv <- apply(CV_gene_salmon[, names(salmon_dirs)], 1, median)
save(CV_gene_salmon, CV_tx_salmon, file = cv_salmon_file)
} else {
message("Salmon CVs already exist.")
}
## Salmon CVs already exist.
load(cv_salmon_file)
cvs_sal <- data.frame(cvs = c(CV_tx_salmon$SRR099223, CV_gene_salmon$SRR099223),
lev = rep(c("transcript", "gene"), c(nrow(CV_tx_salmon), nrow(CV_gene_salmon))))
ggplot(cvs_sal, aes(x = log10(cvs + 0.001), col = lev)) +
geom_line(stat = "density", size = 2) +
ggtitle("") +
xlab("log10(coefficient of variation + 0.001)") + ylab("density") +
scale_colour_manual(values = c("red", "blue"), name = "") +
plot_theme()
load(fc_bottomly_file)
load(fc_bottomly_file_fracmm)
geneCOUNT_fc <- fc_bottomly$counts[match(rownames(salmon_quant$geneCOUNT_sal_simplesum),
rownames(fc_bottomly$counts)), ]
geneCOUNT_fcmm <- fc_bottomly_fracmm$counts[match(rownames(salmon_quant$geneCOUNT_sal_simplesum),
rownames(fc_bottomly_fracmm$counts)), ]
stopifnot(all(rownames(salmon_quant$geneCOUNT_sal_simplesum) ==
rownames(salmon_quant$geneCOUNT_sal_scaledTPM)))
SRR099223 <- cbind(simplesum_salmon = salmon_quant$geneCOUNT_sal_simplesum[, "SRR099223"],
scaledTPM_salmon = salmon_quant$geneCOUNT_sal_scaledTPM[, "SRR099223"],
featureCounts = geneCOUNT_fc[, "SRR099223"],
featureCountsMM = geneCOUNT_fcmm[, "SRR099223"])
rownames(SRR099223) <- rownames(salmon_quant$geneCOUNT_sal_simplesum)
SRR099223 <- log2(SRR099223 + 1)
pairs(SRR099223, upper.panel = panel_smooth, lower.panel = panel_cor)
pairs(SRR099223[which(rowSums(SRR099223 <= 0) == 0), ], upper.panel = panel_smooth,
lower.panel = panel_cor)
idx <- rownames(SRR099223)[order(abs(SRR099223[, "simplesum_salmon"] -
SRR099223[, "scaledTPM_salmon"]), decreasing = TRUE)[1:5]]
2^SRR099223[idx, ] - 1
## simplesum_salmon scaledTPM_salmon featureCounts featureCountsMM
## ENSMUSG00000064356 1208.9800 60104.854 0 387.50
## ENSMUSG00000106666 38.4926 1913.670 0 29.00
## ENSMUSG00000070834 28.5044 1417.112 0 14.50
## ENSMUSG00000082127 51.9810 2544.111 0 1.83
## ENSMUSG00000074846 23.5224 1169.421 0 0.00
subset(tx2gene, gene %in% idx)
## transcript gene
## 74708 ENSMUST00000082407 ENSMUSG00000064356
## 78060 ENSMUST00000118674 ENSMUSG00000070834
## 80558 ENSMUST00000188689 ENSMUSG00000074846
## 85443 ENSMUST00000121365 ENSMUSG00000082127
## 110140 ENSMUST00000199590 ENSMUSG00000106666
Given the gene count matrices defined above (corresponding to the featureCounts, simplesum and scaledTPM matrices mentioned in the manuscript), we apply edgeR (see below for DESeq2 results) to perform differential gene expression. For the simplesum and featureCounts matrices, we also perform the analyses using the offsets derived from the average transcript lengths (simplesum_avetxl).
res_sal_simplesum_edgeR <- diff_expression_edgeR(counts = salmon_quant$geneCOUNT_sal_simplesum,
meta = meta, cond_name = "strain",
sample_name = "srr.id",
gene_length_matrix = NULL)
res_sal_simplesum_avetxl_edgeR <- diff_expression_edgeR(counts = salmon_quant$geneCOUNT_sal_simplesum,
meta = meta, cond_name = "strain",
sample_name = "srr.id",
gene_length_matrix = salmon_quant$avetxlength)
res_sal_scaledTPM_edgeR <- diff_expression_edgeR(counts = salmon_quant$geneCOUNT_sal_scaledTPM,
meta = meta, cond_name = "strain",
sample_name = "srr.id",
gene_length_matrix = NULL)
res_fc_edgeR <- diff_expression_edgeR(counts = geneCOUNT_fc,
meta = meta, cond_name = "strain",
sample_name = "srr.id",
gene_length_matrix = NULL)
res_fc_avetxl_edgeR <- diff_expression_edgeR(counts = geneCOUNT_fc,
meta = meta, cond_name = "strain",
sample_name = "srr.id",
gene_length_matrix = salmon_quant$avetxlength)
dfh <- data.frame(pvalue = c(res_fc_edgeR$tt$PValue,
res_fc_avetxl_edgeR$tt$PValue,
res_sal_scaledTPM_edgeR$tt$PValue,
res_sal_simplesum_edgeR$tt$PValue,
res_sal_simplesum_avetxl_edgeR$tt$PValue),
mth = c(rep("featureCounts", nrow(res_fc_edgeR$tt)),
rep("featureCounts_avetxl", nrow(res_fc_avetxl_edgeR$tt)),
rep("scaledTPM_salmon", nrow(res_sal_scaledTPM_edgeR$tt)),
rep("simplesum_salmon", nrow(res_sal_simplesum_edgeR$tt)),
rep("simplesum_salmon_avetxl", nrow(res_sal_simplesum_avetxl_edgeR$tt))))
ggplot(dfh, aes(x = pvalue)) + geom_histogram() + facet_wrap(~mth) +
plot_theme() +
xlab("p-value") + ylab("count")
par(mfrow = c(2, 3))
plotBCV(res_fc_edgeR$dge, main = "featureCounts")
plotBCV(res_fc_avetxl_edgeR$dge, main = "featureCounts_avetxl")
plotBCV(res_sal_scaledTPM_edgeR$dge, main = "scaledTPM_salmon")
plotBCV(res_sal_simplesum_edgeR$dge, main = "simplesum_salmon")
plotBCV(res_sal_simplesum_avetxl_edgeR$dge, main = "simplesum_salmon_avetxl")
par(mfrow = c(2, 3))
plotSmear(res_fc_edgeR$dge, main = "featureCounts", ylim = c(-18, 18))
plotSmear(res_fc_avetxl_edgeR$dge, main = "featureCounts_avetxl", ylim = c(-18, 18))
plotSmear(res_sal_scaledTPM_edgeR$dge, main = "scaledTPM_salmon", ylim = c(-18, 18))
plotSmear(res_sal_simplesum_edgeR$dge, main = "simplesum_salmon", ylim = c(-18, 18))
plotSmear(res_sal_simplesum_avetxl_edgeR$dge, main = "simplesum_salmon_avetxl", ylim = c(-18, 18))
par(mfrow = c(1, 1))
cobra_edgeR <- COBRAData(padj = data.frame(simplesum_salmon = res_sal_simplesum_edgeR$tt$FDR,
row.names = rownames(res_sal_simplesum_edgeR$tt)))
cobra_edgeR <- COBRAData(padj = data.frame(scaledTPM_salmon = res_sal_scaledTPM_edgeR$tt$FDR,
row.names = rownames(res_sal_scaledTPM_edgeR$tt)),
object_to_extend = cobra_edgeR)
cobra_edgeR <- COBRAData(padj = data.frame(simplesum_salmon_avetxl = res_sal_simplesum_avetxl_edgeR$tt$FDR,
row.names = rownames(res_sal_simplesum_avetxl_edgeR$tt)),
object_to_extend = cobra_edgeR)
cobra_edgeR <- COBRAData(padj = data.frame(featureCounts = res_fc_edgeR$tt$FDR,
row.names = rownames(res_fc_edgeR$tt)),
object_to_extend = cobra_edgeR)
cobra_edgeR <- COBRAData(padj = data.frame(featureCounts_avetxl = res_fc_avetxl_edgeR$tt$FDR,
row.names = rownames(res_fc_avetxl_edgeR$tt)),
object_to_extend = cobra_edgeR)
cobraperf_edgeR <- calculate_performance(cobra_edgeR, aspects = "overlap", thr_venn = 0.05)
cobraplot1_edgeR <- prepare_data_for_plot(cobraperf_edgeR, incltruth = FALSE,
colorscheme = c("blue", "cyan", "black", "green", "red"))
plot_overlap(cobraplot1_edgeR, cex = c(1, 0.7, 0.7))
df1 <- Reduce(function(...) merge(..., by = "gene", all = TRUE),
list(data.frame(gene = rownames(res_sal_scaledTPM_edgeR$tt),
scaledTPM_salmon = res_sal_scaledTPM_edgeR$tt$logFC,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_edgeR$tt),
simplesum_salmon = res_sal_simplesum_edgeR$tt$logFC,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_avetxl_edgeR$tt),
simplesum_salmon_avetxl =
res_sal_simplesum_avetxl_edgeR$tt$logFC,
stringsAsFactors = FALSE)))
rownames(df1) <- df1$gene
df1$gene <- NULL
pairs(df1, upper.panel = panel_smooth, lower.panel = panel_cor)
df2 <- Reduce(function(...) merge(..., by = "gene", all = TRUE),
list(data.frame(gene = rownames(salmon_quant$geneTPM_sal),
salmon_quant$geneTPM_sal,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_scaledTPM_edgeR$tt),
scaledTPM_salmon_logFC = res_sal_scaledTPM_edgeR$tt$logFC,
scaledTPM_salmon_logCPM = res_sal_scaledTPM_edgeR$tt$logCPM,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_edgeR$tt),
simplesum_salmon_logFC = res_sal_simplesum_edgeR$tt$logFC,
simplesum_salmon_logCPM = res_sal_simplesum_edgeR$tt$logCPM,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_avetxl_edgeR$tt),
simplesum_salmon_avetxl_logFC =
res_sal_simplesum_avetxl_edgeR$tt$logFC,
simplesum_salmon_avetxl_logCPM =
res_sal_simplesum_avetxl_edgeR$tt$logCPM,
stringsAsFactors = FALSE)))
rownames(df2) <- df2$gene
df2$gene <- NULL
df2$scaledTPM_salmon_logCPMbinary <- Hmisc::cut2(df2$scaledTPM_salmon_logCPM, g = 2)
df2$simplesum_salmon_logCPMbinary <- Hmisc::cut2(df2$simplesum_salmon_logCPM, g = 2)
df2$simplesum_salmon_avetxl_logCPMbinary <- Hmisc::cut2(df2$simplesum_salmon_avetxl_logCPM, g = 2)
df2$sumA <- rowSums(df2[, meta$srr.id[meta$strain == "DBA/2J"]])
df2$sumB <- rowSums(df2[, meta$srr.id[meta$strain == "C57BL/6J"]])
df2$allzero_onecond <- "expressed in both groups"
df2$allzero_onecond[union(which(df2$sumA == 0), which(df2$sumB == 0))] <- "expressed in one group"
df2$onecol <- rep("", nrow(df2))
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = onecol)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("Bottomly") + theme(legend.position = "bottom") +
scale_color_manual(values = c("blue"), name = "") +
theme(legend.background = element_rect(fill = "white"), legend.key = element_blank()) +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 0)))
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = allzero_onecond)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("Bottomly") + theme(legend.position = "bottom") +
scale_color_manual(values = c("blue", "red"), name = "") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 7)))
ggplot(subset(df2, !is.na(scaledTPM_salmon_logCPMbinary)),
aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC,
col = scaledTPM_salmon_logCPMbinary)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
facet_wrap(~scaledTPM_salmon_logCPMbinary) +
plot_theme() + ggtitle("Bottomly") + theme(legend.position = "bottom") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
scale_color_manual(values = c("red", "blue"), name = "scaledTPM_salmon, logCPM") +
guides(colour = guide_legend(override.aes = list(size = 7)))
BPPARAM = MulticoreParam(6)
dxd <- DEXSeqDataSet(countData = round(salmon_quant$txCOUNT_sal), sampleData = meta,
design = ~sample + exon + strain:exon,
featureID = rownames(salmon_quant$txCOUNT_sal),
groupID = tx2gene$gene[match(rownames(salmon_quant$txCOUNT_sal),
tx2gene$transcript)])
dxd <- estimateSizeFactors(dxd)
dxd <- estimateDispersions(dxd, BPPARAM = BPPARAM)
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
plotDispEsts(dxd)
dxd <- testForDEU(dxd, BPPARAM = BPPARAM)
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
dxr <- DEXSeqResults(dxd)
qval_dtu_salmon <- perGeneQValue(dxr)
res_dte_salmon_tx <- diff_expression_edgeR(counts = salmon_quant$txCOUNT_sal,
meta = meta, cond_name = "strain",
sample_name = "srr.id",
gene_length_matrix = NULL)
res_dte_salmon_tx$tt$gene <- as.character(tx2gene$gene[match(rownames(res_dte_salmon_tx$tt),
tx2gene$transcript)])
## Summarise on gene level
geneSplit = split(seq(along = res_dte_salmon_tx$tt$gene), res_dte_salmon_tx$tt$gene)
pGene = sapply(geneSplit, function(i) min(res_dte_salmon_tx$tt$PValue[i]))
stopifnot(all(is.finite(pGene)))
theta = unique(sort(pGene))
q = DEXSeq:::perGeneQValueExact(pGene, theta, geneSplit)
qval_dte_salmon_gene = rep(NA_real_, length(pGene))
qval_dte_salmon_gene = q[match(pGene, theta)]
qval_dte_salmon_gene = pmin(1, qval_dte_salmon_gene)
names(qval_dte_salmon_gene) = names(geneSplit)
stopifnot(!any(is.na(qval_dte_salmon_gene)))
qval_dge <- res_sal_scaledTPM_edgeR$tt$FDR
names(qval_dge) <- rownames(res_sal_scaledTPM_edgeR$tt)
cobra <- COBRAData(padj = data.frame(DTE = qval_dte_salmon_gene, row.names = names(qval_dte_salmon_gene)))
cobra <- COBRAData(padj = data.frame(DTU = qval_dtu_salmon, row.names = names(qval_dtu_salmon)),
object_to_extend = cobra)
cobra <- COBRAData(padj = data.frame(DGE = qval_dge, row.names = names(qval_dge)),
object_to_extend = cobra)
cobraperf <- calculate_performance(cobra, aspects = "overlap", thr_venn = 0.05)
cobraplot <- prepare_data_for_plot(cobraperf)
plot_overlap(cobraplot)
res_sal_simplesum_deseq2 <- diff_expression_DESeq2(txi = NULL,
counts = salmon_quant$geneCOUNT_sal_simplesum,
meta = meta, cond_name = "strain",
level1 = "C57BL/6J", level2 = "DBA/2J",
sample_name = "srr.id")
res_sal_simplesum_avetxl_deseq2 <- diff_expression_DESeq2(txi = salmon_quant$txi_salmonsimplesum,
counts = NULL,
meta = meta, cond_name = "strain",
level1 = "C57BL/6J", level2 = "DBA/2J",
sample_name = "srr.id")
res_sal_scaledTPM_deseq2 <- diff_expression_DESeq2(txi = NULL,
counts = salmon_quant$geneCOUNT_sal_scaledTPM,
meta = meta, cond_name = "strain",
level1 = "C57BL/6J", level2 = "DBA/2J",
sample_name = "srr.id")
res_fc_deseq2 <- diff_expression_DESeq2(txi = NULL,
counts = geneCOUNT_fc,
meta = meta, cond_name = "strain",
level1 = "C57BL/6J", level2 = "DBA/2J",
sample_name = "srr.id")
res_fc_avetxl_deseq2 <-
diff_expression_DESeq2(txi = list(counts = geneCOUNT_fc,
length = salmon_quant$avetxlength[match(rownames(geneCOUNT_fc),
rownames(salmon_quant$avetxlength)),
match(colnames(geneCOUNT_fc),
colnames(salmon_quant$avetxlength))],
countsFromAbundance = "no"),
counts = NULL,
meta = meta, cond_name = "strain",
level1 = "C57BL/6J", level2 = "DBA/2J",
sample_name = "srr.id")
dfh <- data.frame(pvalue = c(res_fc_deseq2$res$pvalue,
res_fc_avetxl_deseq2$res$pvalue,
res_sal_scaledTPM_deseq2$res$pvalue,
res_sal_simplesum_deseq2$res$pvalue,
res_sal_simplesum_avetxl_deseq2$res$pvalue),
mth = c(rep("featureCounts", nrow(res_fc_deseq2$res)),
rep("featureCounts_avetxl", nrow(res_fc_avetxl_deseq2$res)),
rep("scaledTPM_salmon", nrow(res_sal_scaledTPM_deseq2$res)),
rep("simplesum_salmon", nrow(res_sal_simplesum_deseq2$res)),
rep("simplesum_salmon_avetxl", nrow(res_sal_simplesum_avetxl_deseq2$res))))
ggplot(dfh, aes(x = pvalue)) + geom_histogram() + facet_wrap(~mth) +
plot_theme() +
xlab("p-value") + ylab("count")
par(mfrow = c(2, 3))
plotDispEsts(res_fc_deseq2$dsd, main = "featureCounts")
plotDispEsts(res_fc_avetxl_deseq2$dsd, main = "featureCounts_avetxl")
plotDispEsts(res_sal_scaledTPM_deseq2$dsd, main = "scaledTPM_salmon")
plotDispEsts(res_sal_simplesum_deseq2$dsd, main = "simplesum_salmon")
plotDispEsts(res_sal_simplesum_avetxl_deseq2$dsd, main = "simplesum_salmon_avetxl")
par(mfrow = c(2, 3))
DESeq2::plotMA(res_fc_deseq2$dsd, main = "featureCounts")
DESeq2::plotMA(res_fc_avetxl_deseq2$dsd, main = "featureCounts_avetxl")
DESeq2::plotMA(res_sal_scaledTPM_deseq2$dsd, main = "scaledTPM_salmon")
DESeq2::plotMA(res_sal_simplesum_deseq2$dsd, main = "simplesum_salmon")
DESeq2::plotMA(res_sal_simplesum_avetxl_deseq2$dsd, main = "simplesum_salmon_avetxl")
par(mfrow = c(1, 1))
cobra_deseq2 <- COBRAData(padj = data.frame(simplesum_salmon = res_sal_simplesum_deseq2$res$padj,
row.names = rownames(res_sal_simplesum_deseq2$res)))
cobra_deseq2 <- COBRAData(padj = data.frame(scaledTPM_salmon = res_sal_scaledTPM_deseq2$res$padj,
row.names = rownames(res_sal_scaledTPM_deseq2$res)),
object_to_extend = cobra_deseq2)
cobra_deseq2 <- COBRAData(padj = data.frame(simplesum_salmon_avetxl = res_sal_simplesum_avetxl_deseq2$res$padj,
row.names = rownames(res_sal_simplesum_avetxl_deseq2$res)),
object_to_extend = cobra_deseq2)
cobra_deseq2 <- COBRAData(padj = data.frame(featureCounts = res_fc_deseq2$res$padj,
row.names = rownames(res_fc_deseq2$res)),
object_to_extend = cobra_deseq2)
cobra_deseq2 <- COBRAData(padj = data.frame(featureCounts_avetxl = res_fc_avetxl_deseq2$res$padj,
row.names = rownames(res_fc_avetxl_deseq2$res)),
object_to_extend = cobra_deseq2)
cobraperf_deseq2 <- calculate_performance(cobra_deseq2, aspects = "overlap", thr_venn = 0.05)
cobraplot1_deseq2 <- prepare_data_for_plot(cobraperf_deseq2, incltruth = FALSE,
colorscheme = c("blue", "cyan", "black", "green", "red"))
plot_overlap(cobraplot1_deseq2, cex = c(1, 0.7, 0.7))
df1 <- Reduce(function(...) merge(..., by = "gene", all = TRUE),
list(data.frame(gene = rownames(res_sal_scaledTPM_deseq2$res),
scaledTPM_salmon = res_sal_scaledTPM_deseq2$res$log2FoldChange,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_deseq2$res),
simplesum_salmon = res_sal_simplesum_deseq2$res$log2FoldChange,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_avetxl_deseq2$res),
simplesum_salmon_avetxl =
res_sal_simplesum_avetxl_deseq2$res$log2FoldChange,
stringsAsFactors = FALSE)))
rownames(df1) <- df1$gene
df1$gene <- NULL
pairs(df1, upper.panel = panel_smooth, lower.panel = panel_cor)
df2 <- Reduce(function(...) merge(..., by = "gene", all = TRUE),
list(data.frame(gene = rownames(salmon_quant$geneTPM_sal),
salmon_quant$geneTPM_sal,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_scaledTPM_deseq2$res),
scaledTPM_salmon_logFC = res_sal_scaledTPM_deseq2$res$log2FoldChange,
scaledTPM_salmon_basemean = res_sal_scaledTPM_deseq2$res$baseMean,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_deseq2$res),
simplesum_salmon_logFC = res_sal_simplesum_deseq2$res$log2FoldChange,
simplesum_salmon_basemean = res_sal_simplesum_deseq2$res$baseMean,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_avetxl_deseq2$res),
simplesum_salmon_avetxl_logFC =
res_sal_simplesum_avetxl_deseq2$res$log2FoldChange,
simplesum_salmon_avetxl_basemean =
res_sal_simplesum_avetxl_deseq2$res$baseMean,
stringsAsFactors = FALSE)))
rownames(df2) <- df2$gene
df2$gene <- NULL
df2$scaledTPM_salmon_basemeanbinary <- Hmisc::cut2(df2$scaledTPM_salmon_basemean, g = 2)
df2$simplesum_salmon_basemeanbinary <- Hmisc::cut2(df2$simplesum_salmon_basemean, g = 2)
df2$simplesum_salmon_avetxl_basemeanbinary <- Hmisc::cut2(df2$simplesum_salmon_avetxl_basemean, g = 2)
df2$sumA <- rowSums(df2[, meta$srr.id[meta$strain == "DBA/2J"]])
df2$sumB <- rowSums(df2[, meta$srr.id[meta$strain == "C57BL/6J"]])
df2$allzero_onecond <- "expressed in both groups"
df2$allzero_onecond[union(which(df2$sumA == 0), which(df2$sumB == 0))] <- "expressed in one group"
df2$onecol <- rep("", nrow(df2))
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = onecol)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("Bottomly") + theme(legend.position = "bottom") +
scale_color_manual(values = c("blue"), name = "") +
theme(legend.background = element_rect(fill = "white"), legend.key = element_blank()) +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 0)))
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = allzero_onecond)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("Bottomly") + theme(legend.position = "bottom") +
scale_color_manual(values = c("blue", "red"), name = "") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 7)))
ggplot(subset(df2, !is.na(scaledTPM_salmon_basemeanbinary)),
aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC,
col = scaledTPM_salmon_basemeanbinary)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
facet_wrap(~scaledTPM_salmon_basemeanbinary) +
plot_theme() + ggtitle("Bottomly") + theme(legend.position = "bottom") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
scale_color_manual(values = c("red", "blue"), name = "scaledTPM_salmon, base mean") +
guides(colour = guide_legend(override.aes = list(size = 7)))
panel_cor <- function(x, y, digits = 3, cex.cor) {
## Panel function to print Pearson and Spearman correlations
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r1 <- abs(cor(x, y, method = "pearson", use = "complete"))
txt1 <- format(c(r1, 0.123456789), digits = digits)[1]
r2 <- abs(cor(x, y, method = "spearman", use = "complete"))
txt2 <- format(c(r2, 0.123456789), digits = digits)[1]
text(0.5, 0.35, paste("pearson =", txt1), cex = 1.1)
text(0.5, 0.65, paste("spearman =", txt2), cex = 1.1)
}
panel_smooth<-function (x, y, col = "blue", bg = NA, pch = ".",
cex = 0.8, ...) {
## Panel function to plot points
points(x, y, pch = pch, col = col, bg = bg, cex = cex)
}
plot_theme <- function() {
## ggplot2 plotting theme
theme_grey() +
theme(legend.position = "right",
panel.background = element_rect(fill = "white", colour = "black"),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
strip.text = element_text(size = 10),
strip.background = element_rect(fill = NA, colour = "black"),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
plot.title = element_text(colour = "black", size = 20))
}
calc_lengths_mapping <- function(gtf, cdna_fasta, feature_lengths_file,
tx_gene_file) {
## Function to calculate gene and transcript lengths from transcript cDNA
## fasta and gtf file. Also generate mapping between transcript and gene IDs.
suppressPackageStartupMessages(library(GenomicFeatures))
suppressPackageStartupMessages(library(Biostrings))
## Gene/transcript lengths ===============================================
## Transcripts/genes present in gtf file
if (!is.null(gtf)) {
txdb <- makeTxDbFromGFF(gtf, format = "gtf")
ebg <- exonsBy(txdb, "gene")
ebt <- exonsBy(txdb, "tx", use.names = TRUE)
ebg_red <- reduce(ebg)
gene_length <- sum(width(ebg_red))
ebt_red <- reduce(ebt)
tx_length <- sum(width(ebt_red))
} else {
tx_length <- c()
gene_length <- c()
}
## Extend with transcripts from cDNA fasta
cdna <- readDNAStringSet(gsub("\\.gz", "", cdna_fasta))
tx_length2 <- width(cdna)
names(tx_length2) <- sapply(names(cdna), function(i) strsplit(i, " ")[[1]][1])
tx_length2 <- tx_length2[setdiff(names(tx_length2), names(tx_length))]
tx_length <- c(tx_length, tx_length2)
## Gene/transcript mapping ===============================================
## Transcripts/genes present in gtf file
if (!is.null(gtf)) {
tbg <- transcriptsBy(txdb, "gene")
tx2gene <- stack(lapply(tbg, function(w) w$tx_name))
colnames(tx2gene) <- c("transcript", "gene")
} else {
tx2gene <- data.frame(transcript = c(), gene = c())
}
## Extend with mappings from cDNA fasta file
tx <- sapply(names(cdna), function(i) strsplit(i, " ")[[1]][1])
gn <- sapply(names(cdna), function(i) gsub("gene:", "", strsplit(i, " ")[[1]][4]))
tx2gene2 <- data.frame(transcript = tx, gene = gn,
stringsAsFactors = FALSE)
rownames(tx2gene2) <- NULL
tx2gene2 <- tx2gene2[match(setdiff(tx2gene2$transcript,
tx2gene$transcript), tx2gene2$transcript), ]
tx2gene <- rbind(tx2gene, tx2gene2)
gene2tx <- tx2gene[, c("gene", "transcript")]
save(gene_length, tx_length, file = feature_lengths_file)
save(gene2tx, tx2gene, file = tx_gene_file)
}
diff_expression_edgeR <- function(counts, meta, cond_name, sample_name,
gene_length_matrix = NULL) {
## Differential expression analysis with edgeR
suppressPackageStartupMessages(library(edgeR))
## Prepare DGEList
counts <- round(counts)
cts <- counts[rowSums(is.na(counts)) == 0, ]
cts <- cts[rowSums(cts) != 0, ]
dge <-
DGEList(counts = cts, group = meta[, cond_name][match(colnames(cts),
meta[, sample_name])])
## If average transcript lengths provided, add as offset
if (!is.null(gene_length_matrix)) {
egf <- gene_length_matrix[match(rownames(cts), rownames(gene_length_matrix)),
match(colnames(cts), colnames(gene_length_matrix))]
egf <- egf / exp(rowMeans(log(egf)))
o <- log(calcNormFactors(cts/egf)) + log(colSums(cts/egf))
dge$offset <- t(t(log(egf)) + o)
} else {
dge <- calcNormFactors(dge)
}
## Estimate dispersions and fit model
design <- model.matrix(~dge$samples$group)
dge <- estimateGLMCommonDisp(dge, design = design)
dge <- estimateGLMTrendedDisp(dge, design = design)
dge <- estimateGLMTagwiseDisp(dge, design = design)
fit <- glmFit(dge, design = design)
lrt <- glmLRT(fit)
tt <- topTags(lrt, n = Inf)$table
return(list(dge = dge, tt = tt))
}
diff_expression_DESeq2 <- function(txi = NULL, counts, meta, cond_name,
level1, level2, sample_name) {
## Differential expression analysis with DESeq2
suppressPackageStartupMessages(library(DESeq2,
lib.loc = "/home/charlotte/R/x86_64-pc-linux-gnu-library/3.2"))
## If tximport object provided, generate DESeqDataSet from it. Otherwise,
## use the provided count matrix.
if (!is.null(txi)) {
txi$counts <- round(txi$counts)
keep_feat <- rownames(txi$counts[rowSums(is.na(txi$counts)) == 0 & rowSums(txi$counts) != 0, ])
txi <- lapply(txi, function(w) {
if (!is.null(dim(w))) w[match(keep_feat, rownames(w)), ]
else w
})
dsd <- DESeqDataSetFromTximport(txi,
colData = meta[match(colnames(txi$counts),
meta[, sample_name]), ],
design = as.formula(paste0("~", cond_name)))
} else {
counts <- round(counts)
cts = counts[rowSums(is.na(counts)) == 0, ]
cts <- cts[rowSums(cts) != 0, ]
dsd <- DESeqDataSetFromMatrix(countData = round(cts),
colData = meta[match(colnames(cts),
meta[, sample_name]), ],
design = as.formula(paste0("~", cond_name)))
}
## Estimate dispersions and fit model
dsd <- DESeq(dsd, test = "Wald", fitType = "local", betaPrior = TRUE)
res <- as.data.frame(results(dsd, contrast = c(cond_name, level2, level1),
cooksCutoff = FALSE, independentFiltering = FALSE))
return(list(dsd = dsd, res = res))
}
sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.3 LTS
##
## locale:
## [1] LC_CTYPE=C LC_NUMERIC=C LC_TIME=en_CA.UTF-8
## [4] LC_COLLATE=en_CA.UTF-8 LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] edgeR_3.12.0 limma_3.26.3 DEXSeq_1.16.2
## [4] DESeq2_1.11.6 RcppArmadillo_0.6.200.2.0 Rcpp_0.12.2
## [7] SummarizedExperiment_1.0.1 Biobase_2.30.0 GenomicRanges_1.22.1
## [10] GenomeInfoDb_1.6.1 IRanges_2.4.4 S4Vectors_0.8.3
## [13] BiocGenerics_0.16.1 BiocParallel_1.4.0 dplyr_0.4.3
## [16] ggplot2_1.0.1 iCOBRA_0.99.3 tximport_0.0.7
## [19] Rsubread_1.20.2
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 RColorBrewer_1.1-2 tools_3.2.2 R6_2.1.1
## [5] DT_0.1 rpart_4.1-10 KernSmooth_2.23-15 Hmisc_3.17-0
## [9] DBI_0.3.1 colorspace_1.2-6 nnet_7.3-10 gridExtra_2.0.0
## [13] formatR_1.2.1 labeling_0.3 caTools_1.17.1 scales_0.3.0
## [17] genefilter_1.52.0 stringr_1.0.0 digest_0.6.8 Rsamtools_1.22.0
## [21] shinyBS_0.61 foreign_0.8-65 rmarkdown_0.8.1 XVector_0.10.0
## [25] htmltools_0.2.6 htmlwidgets_0.5 RSQLite_1.0.0 shiny_0.12.2
## [29] hwriter_1.3.2 gtools_3.5.0 acepack_1.3-3.3 RCurl_1.95-4.7
## [33] magrittr_1.5 Formula_1.2-1 futile.logger_1.4.1 munsell_0.4.2
## [37] proto_0.3-10 stringi_1.0-1 yaml_2.1.13 MASS_7.3-43
## [41] zlibbioc_1.16.0 gplots_2.17.0 plyr_1.8.3 grid_3.2.2
## [45] gdata_2.17.0 shinydashboard_0.5.1 lattice_0.20-33 Biostrings_2.38.2
## [49] splines_3.2.2 annotate_1.48.0 locfit_1.5-9.1 knitr_1.11
## [53] geneplotter_1.48.0 reshape2_1.4.1 biomaRt_2.26.0 futile.options_1.0.0
## [57] XML_3.98-1.3 evaluate_0.8 latticeExtra_0.6-26 lambda.r_1.1.7
## [61] httpuv_1.3.3 gtable_0.1.2 assertthat_0.1 mime_0.4
## [65] xtable_1.8-0 survival_2.38-3 AnnotationDbi_1.32.0 cluster_2.0.3
## [69] statmod_1.4.22 ROCR_1.0-7