project_folders <- function() {
base::dir.create("00_raw_data")
base::dir.create("01_analysis")
base::dir.create("02_tidy_data")
base::dir.create("03_r_scripts")
base::dir.create("04_plots")
base::dir.create("99_results")
}
# project_folders()RNASeq Analysis
This is a mock analysis
For this analysis try to keep a estructured folder tree in the same directory. Something like this:
You can follow your own system but keep a logical and sequential order. For this tutorial we will follow this convention using Snakemake inside the analysis folder.
This tutorial is a reproducible implementation from: Research Technology Bioinformatics Tools for Life Science RNAseq tutorial to make it ‘reproducible’.
Libraries
Initiate renv:
CRAN Packages installation on renv
Bioconductor missing dependency to handle Kallisto h5 files.
Usually renv::init() creates a renv.lock file but if something failes it will be necessary to take a renv snapshot manually:
renv::snapshot()Loading all the packages, however, to keep track of each function a syntax library::function() will be used in the follow code.
Loading data
Don’t forget to use here, it helps a lot with here::here()
Loading analysis results
metadata <- readr::read_table(here::here("analysis/metadata.tsv"))
kallisto_dir <-"analysis/rnaseqSE_snakemake/02_mapping/kallisto"
lsdir <- list.dirs(here::here(kallisto_dir), recursive = FALSE)
kallisto_files <- paste0(lsdir, "/abundance.h5")ensembl <- biomaRt::useMart(biomart = "ensembl",
dataset = "scerevisiae_gene_ensembl")
t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id",
"ensembl_gene_id",
"external_gene_name"),
mart = ensembl)
txi_kallisto <- tximport::tximport(kallisto_files,
type = "kallisto",
tx2gene = t2g,
ignoreTxVersion = TRUE) dds <- DESeq2::DESeqDataSetFromTximport(txi_kallisto, colData=metadata, design = ~ condition)
dds <- DESeq2::DESeq(dds)PCA analysis
rld <- DESeq2::rlog(dds, blind=TRUE)
DESeq2::plotPCA(rld, intgroup="condition") + ggplot2::geom_text(ggplot2::aes(label=name))Creating contrasts and running a Wald test
contrast <- c("condition", "SNF2", "WT")
res_unshrunken <- DESeq2::results(dds, contrast=contrast)
DESeq2::summary(res_unshrunken)Shrinkage of the log2 fold changes
res <- DESeq2::lfcShrink(dds, contrast=contrast, res=res_unshrunken, type="normal")
DESeq2::summary(res)
head(res)Filtering to find significant genes
padj_cutoff <- 0.05 # False Discovery Rate cutoff
significant_results <- res[which(res$padj < padj_cutoff), ]save results using customized file_name
write.table(significant_results,
'analysis/significant_padj_0.05.txt',
quote=FALSE)Visualization
simple plot for a single gene YOR290C (SNF2)
plotCounts(dds, gene="YOR290C", intgroup="condition")Plot multiple genes in a heatmap: extract the counts from the rlog transformed object and select by row name using the list of genes:
significant_results_sorted <- significant_results[order(significant_results$padj), ]
significant_genes_25 <- rownames(significant_results_sorted[1:25, ])
rld_counts <- SummarizedExperiment::assay(rld)
rld_counts_sig <- rld_counts[significant_genes_25, ]
colnames(rld_counts_sig) <- metadata$sample
annotation <- as.data.frame(metadata)
rownames(annotation) <- annotation$sample
annotation$sample <- NULL
pheatmap::pheatmap(rld_counts_sig,
cluster_rows = T,
show_rownames = F,
annotation_col = annotation,
border_color = NA,
fontsize = 10,
scale = "row",
fontsize_row = 8,
height = 20)load previously saved result
significant_results_test <- read.table("analysis/significant_padj_0.05.txt",
header=TRUE, row.names = 1)volcano plot
Add another column in the results table to label the significant genes using threshold of padj<0.05 and absolute value of log2foldchange >=1
res_table <- res %>%
data.frame() %>%
tibble::rownames_to_column(var="gene") %>%
tibble::as_tibble()
res_table <- res_table %>%
dplyr::mutate(threshold_OE = padj < 0.05 & abs(log2FoldChange) >= 1)
# you can view the modified table
head(res_table)make volcano plot, the significant genes will be labeled in red
ggplot(res_table) +
geom_point(aes(x = log2FoldChange,
y = -log10(padj),
colour = threshold_OE)) +
scale_color_manual(values=c("black", "red")) + # black v.s. red dots
ggtitle("SNF2 against WT") + # this line defines the title of the plot
xlab("log2 fold change") + # this line defines the name of the x-axis
ylab("-log10 adjusted p-value") + # name of y-axis
scale_x_continuous(limits = c(-7.5,7.5)) + # the axis range is set to be from -7.5 to 7.5
theme(legend.position = "none", #c(0.9, 0.9),
plot.title = element_text(size = rel(1.5), hjust = 0.5),
axis.title = element_text(size = rel(1.25))) Functional analysis using clusterprofiler
Run GO enrichment analysis for the top 500 genes
significant_results_sorted <- res[order(res$padj), ]
significant_genes_500 <- rownames(significant_results_sorted[1:500, ])
ego <- clusterProfiler::enrichGO(gene = significant_genes_500,
keyType = "ENSEMBL",
OrgDb = org.Sc.sgd.db::org.Sc.sgd.db)Output results from GO analysis to a table
cluster_summary <- data.frame(ego)Dotplot for the top 50 genes
clusterProfiler::dotplot(ego, showCategory=10)Enrichmap clusters the 50 most significant (by padj) GO terms to visualize relationships between terms
d <- GOSemSim::godata("org.Sc.sgd.db", ont = "BP")
compare_cluster_GO_emap <- enrichplot::pairwise_termsim(ego,
semData = d,
method="Wang")
clusterProfiler::emapplot(compare_cluster_GO_emap, showCategory = 10)