Chapter 2 Pipeline
2.1 software install
There are some required software that you can install with conda
conda create -n salmon salmon
conda activate salmon
conda install trimmomatic
conda install multiqc
2.2 from fastq files to alignments
2.2.1 Quality control of FASTQ files
make directory
cd workspace
mkdir datasets
go to workspace/
, fastq files is in datasets/
, the test data can be found at /share/work/HPC/work_tmp/PipelineJob_141934_20230925/data
mkdir fastqc
for filename in datasets/*.fastq.gz
do
fastqc -o fastqc $filename
done
multiqc fastqc/*
go to fastqc
, open multiqc_report.html
in browser, check whether the data is normal
2.2.2 Trimming
make directory
mkdir trim
cd trim
mkdir qc
cd ../../
make filenames list, each row represent a sample, the test data can be found at /share/work/HPC/work_tmp/PipelineJob_141934_20230925/data/raw_filenames.txt
head filenames.txt
sample01
sample02
sample03
trimming
cat filenames.txt | parallel -j 6 --results logs/trimmomatic/ --joblog logs/trimmomatic.log \
trimmomatic PE -threads 4 \
-phred33 \
-trimlog trim/qc/{}.log \
datasets/{}_1.fastq.gz datasets/{}_2.fastq.gz \
trim/qc/{}_1_paired.fastq trim/qc/{}_1_unpaired.fastq \
trim/qc/{}_2_paired.fastq trim/qc/{}_2_unpaired.fastq \
ILLUMINACLIP:$MINICONDA/envs/salmon/share/trimmomatic/adapters/TruSeq3-PE-2.fa:2:30:10:2:TRUE \
LEADING:20 TRAILING:20 \
SLIDINGWINDOW:4:15 \
MINLEN:36 &
2.2.3 salmon
make directory
mkdri output
mkdir references
if there is no index of salmon, you need build index with salmon, the test data can be found at /share/projects/Analytics/IO/caiyi/iop229-rnaseqnewtargettumor/data/references/MmvM33_samlon_ind
salmon index -t references.fa.gz -i references/ref_salmon_ind
Quantifying the samples
nohup cat filenames.txt | parallel --resume-failed --tmpdir trim/salmon/tmpdir -j 3 --results logs/salmon_map_PE/ --joblog logs/salmon_map_PE.log \
salmon quant -i references/ref_salmon_ind -l IU \
-1 trim/qc/{}_1_paired.fastq \
-2 trim/qc/{}_2_paired.fastq \
-p 8 \
--validateMappings \
--recoverOrphans \
--allowDovetail \
--useVBOpt \
--seqBias \
-o output/salmon/{}_quant &
The test salmon data can be found at /share/work/HPC/work_tmp/PipelineJob_141934_20230925/trim/salmon/*_quant
2.3 differential expression analysis
make directory
cd output
mkdir DA
The differential expression analysis were performed in R. once in R, set your working directory setwd('~/workspace/output/DA')
The metadata
of samples contain group information
library(tidyverse)
library(here)
library(tximport)
library(GenomicFeatures)
library(DESeq2)
library(RColorBrewer)
library(gplots)
library(clusterProfiler) # need latest version 3.14.3
library(enrichplot)
options(stringsAsFactors = FALSE)
# ref.annotation.gtf can be found at /share/projects/Analytics/IO/caiyi/iop229-rnaseqnewtargettumor/data/references/gencode.vM33.primary_assembly.annotation.gtf
txdb <- makeTxDbFromGFF('~/workspace/references/ref.annotation.gtf')
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- select(txdb, keys = k, keytype = "TXNAME", columns = "GENEID")
head(tx2gene)
samples <- read.table('~/workspace/filenames.txt', header = FALSE)
files <- file.path('~/workspace/output/salmon', paste0(samples$V1, "_quant"), "quant.sf")
names(files) <- paste0(samples$V1)
txi.salmon <- tximport(files, type = "salmon", tx2gene = tx2gene)
head(txi.salmon$counts)
metadata <- data.frame(
SeqID=samples$V1,
Group=c(
rep('M43_PD1',3),
rep('M15_PD1',3),
rep('PD1',3),
rep('CTRL',3),
rep('R_PD1',3)
),
SampleID=c(
'M010-T1 1_1',
'M010-T1 1_4',
'M010-T1 1_6',
'M010-T1 1_8',
'M010-T1 1_10',
'M010-T2 2_1',
'M010-T2 2_9',
'M010-T8 8_1',
'M010-T8 8_4',
'M010-T8 8_10',
'M010-T8 8_2',
'M010-T8 8_7',
'M010-T2 2_7',
'M010-T2 2_5',
'M010-T2 2_2'
)
)
dds <- DESeqDataSetFromTximport(txi.salmon, metadata, ~Group)
dds <- DESeq(dds)
# Then, we create a sample distance heatmap and plot a PCA:
rld <- rlogTransformation(dds)
head(assay(rld))
z <- DESeq2::plotPCA(rld, intgroup="Group")
zz <- DESeq2::plotPCA(rld, returnData = TRUE, intgroup="Group")
zz <- merge(zz, metadata, by.x="name", by.y='SeqID')
z + ggrepel::geom_label_repel(data = zz, aes(x=PC1, y=PC2, label=SampleID))
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(metadata$Group))])
sampleDists <- as.matrix(dist(t(assay(rld))))
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[metadata$Group],
RowSideColors=mycols[metadata$Group],
margin=c(10, 10), main="Sample Distance Matrix")
# Volcano plot, CTRL vs PD1
res <- results(dds, contrast = c("Group", "CTRL", "PD1"))
summary(res)
table(res$padj<0.05)
res <- res[order(res$padj), ]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
resdata <- resdata %>%
mutate(stableGeneID=str_split_fixed(Gene, '\\.', 2)[,1])
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
# GO
organism <- 'org.Mm.eg.db'
original_gene_list <- resdata$log2FoldChange
names(original_gene_list) <- resdata$Gene
gene_list <- na.omit(original_gene_list)
gene_list <- sort(gene_list, decreasing = TRUE)
gse <- gseGO(geneList=gene_list,
ont ="ALL",
keyType = "ENSEMBL",
nPerm = 10000,
minGSSize = 3,
maxGSSize = 800,
pvalueCutoff = 0.05,
verbose = TRUE,
OrgDb = organism,
pAdjustMethod = "none")
require(DOSE)
dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
gseaplot(gse, by = "all", title = gse$Description[1], geneSetID = 1)
# KEGG pathway
ids <- bitr(names(original_gene_list), fromType = "ENSEMBL", toType = "ENTREZID", OrgDb=organism)
dedup_ids <- ids[!duplicated(ids[c("ENSEMBL")]),]
resdata2 <- resdata[resdata$Gene %in% dedup_ids$ENSEMBL,]
resdata2$Y <- dedup_ids$ENTREZID
kegg_gene_list <- resdata2$log2FoldChange
names(kegg_gene_list) <- resdata2$Y
kegg_gene_list <- na.omit(kegg_gene_list)
kegg_gene_list <- sort(kegg_gene_list, decreasing = TRUE)
library(R.utils)
R.utils::setOption('clusterProfiler.download.method', 'wget')
getOption('clusterProfiler.download.method')
kegg_organism <- "mmu"
kk2 <- gseKEGG(geneList = kegg_gene_list,
organism = kegg_organism,
nPerm = 10000,
minGSSize = 3,
maxGSSize = 800,
pvalueCutoff = 0.05,
pAdjustMethod = "none",
keyType = "ncbi-geneid")
dotplot(kk2, showCategory = 10, title = "Enriched Pathways" , split=".sign") + facet_grid(.~.sign)