Chapter 2 Pipeline

2.1 software install

There are some required software that you can install with conda

conda create -n salmon salmon
conda activate salmon
conda install trimmomatic
conda install multiqc

2.2 from fastq files to alignments

2.2.1 Quality control of FASTQ files

make directory

cd workspace
mkdir datasets

go to workspace/, fastq files is in datasets/, the test data can be found at /share/work/HPC/work_tmp/PipelineJob_141934_20230925/data

mkdir fastqc
for filename in datasets/*.fastq.gz
do
    fastqc -o fastqc $filename
done
multiqc fastqc/*

go to fastqc, open multiqc_report.html in browser, check whether the data is normal

2.2.2 Trimming

make directory

mkdir trim
cd trim
mkdir qc
cd ../../

make filenames list, each row represent a sample, the test data can be found at /share/work/HPC/work_tmp/PipelineJob_141934_20230925/data/raw_filenames.txt

head filenames.txt
sample01
sample02
sample03

trimming

cat filenames.txt | parallel -j 6 --results logs/trimmomatic/ --joblog logs/trimmomatic.log \
trimmomatic PE -threads 4 \
    -phred33 \
    -trimlog trim/qc/{}.log \
    datasets/{}_1.fastq.gz datasets/{}_2.fastq.gz \
    trim/qc/{}_1_paired.fastq trim/qc/{}_1_unpaired.fastq \
    trim/qc/{}_2_paired.fastq trim/qc/{}_2_unpaired.fastq \
    ILLUMINACLIP:$MINICONDA/envs/salmon/share/trimmomatic/adapters/TruSeq3-PE-2.fa:2:30:10:2:TRUE \
    LEADING:20 TRAILING:20 \
    SLIDINGWINDOW:4:15 \
    MINLEN:36 &

2.2.3 salmon

make directory

mkdri output
mkdir references

if there is no index of salmon, you need build index with salmon, the test data can be found at /share/projects/Analytics/IO/caiyi/iop229-rnaseqnewtargettumor/data/references/MmvM33_samlon_ind

salmon index -t references.fa.gz -i references/ref_salmon_ind

Quantifying the samples

nohup cat filenames.txt | parallel --resume-failed --tmpdir trim/salmon/tmpdir -j 3 --results logs/salmon_map_PE/ --joblog logs/salmon_map_PE.log \
salmon quant -i references/ref_salmon_ind -l IU \
    -1 trim/qc/{}_1_paired.fastq \
    -2 trim/qc/{}_2_paired.fastq \
    -p 8 \
    --validateMappings \
    --recoverOrphans \
    --allowDovetail \
    --useVBOpt \
    --seqBias \
    -o output/salmon/{}_quant &

The test salmon data can be found at /share/work/HPC/work_tmp/PipelineJob_141934_20230925/trim/salmon/*_quant

2.3 differential expression analysis

make directory

cd output
mkdir DA

The differential expression analysis were performed in R. once in R, set your working directory setwd('~/workspace/output/DA')

The metadata of samples contain group information

library(tidyverse)
library(here)
library(tximport)
library(GenomicFeatures)
library(DESeq2)
library(RColorBrewer)
library(gplots)
library(clusterProfiler) # need latest version 3.14.3
library(enrichplot)

options(stringsAsFactors = FALSE)

# ref.annotation.gtf can be found at /share/projects/Analytics/IO/caiyi/iop229-rnaseqnewtargettumor/data/references/gencode.vM33.primary_assembly.annotation.gtf
txdb <- makeTxDbFromGFF('~/workspace/references/ref.annotation.gtf')
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- select(txdb, keys = k, keytype = "TXNAME", columns = "GENEID")
head(tx2gene)

samples <- read.table('~/workspace/filenames.txt', header = FALSE)
files <- file.path('~/workspace/output/salmon', paste0(samples$V1, "_quant"), "quant.sf")
names(files) <- paste0(samples$V1)
txi.salmon <- tximport(files, type = "salmon", tx2gene = tx2gene)
head(txi.salmon$counts)

metadata <- data.frame(
    SeqID=samples$V1,
    Group=c(
        rep('M43_PD1',3),
        rep('M15_PD1',3),
        rep('PD1',3),
        rep('CTRL',3),
        rep('R_PD1',3)
    ),
    SampleID=c(
               'M010-T1 1_1',
               'M010-T1 1_4',
               'M010-T1 1_6',
               'M010-T1 1_8',
               'M010-T1 1_10',
               'M010-T2 2_1',
               'M010-T2 2_9',
               'M010-T8 8_1',
               'M010-T8 8_4',
               'M010-T8 8_10',
               'M010-T8 8_2',
               'M010-T8 8_7',
               'M010-T2 2_7',
               'M010-T2 2_5',
               'M010-T2 2_2'
               )
)

dds <- DESeqDataSetFromTximport(txi.salmon, metadata, ~Group)
dds <- DESeq(dds)

# Then, we create a sample distance heatmap and plot a PCA:
rld <- rlogTransformation(dds)
head(assay(rld))
z <- DESeq2::plotPCA(rld, intgroup="Group")
zz <- DESeq2::plotPCA(rld, returnData = TRUE, intgroup="Group")
zz <- merge(zz, metadata, by.x="name", by.y='SeqID')
z + ggrepel::geom_label_repel(data = zz, aes(x=PC1, y=PC2, label=SampleID))

(mycols <- brewer.pal(8, "Dark2")[1:length(unique(metadata$Group))])
sampleDists <- as.matrix(dist(t(assay(rld))))
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
          col=colorpanel(100, "black", "white"),
          ColSideColors=mycols[metadata$Group],
          RowSideColors=mycols[metadata$Group],
          margin=c(10, 10), main="Sample Distance Matrix")

# Volcano plot, CTRL vs PD1
res <- results(dds, contrast = c("Group", "CTRL", "PD1"))
summary(res)
table(res$padj<0.05)
res <- res[order(res$padj), ]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
resdata <- resdata %>%
    mutate(stableGeneID=str_split_fixed(Gene, '\\.', 2)[,1])

with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

# GO 
organism <- 'org.Mm.eg.db'
original_gene_list <- resdata$log2FoldChange
names(original_gene_list) <- resdata$Gene
gene_list <- na.omit(original_gene_list)
gene_list <- sort(gene_list, decreasing = TRUE)

gse <- gseGO(geneList=gene_list, 
             ont ="ALL", 
             keyType = "ENSEMBL", 
             nPerm = 10000, 
             minGSSize = 3, 
             maxGSSize = 800, 
             pvalueCutoff = 0.05, 
             verbose = TRUE, 
             OrgDb = organism, 
             pAdjustMethod = "none")

require(DOSE)
dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)

gseaplot(gse, by = "all", title = gse$Description[1], geneSetID = 1)

# KEGG pathway
ids <- bitr(names(original_gene_list), fromType = "ENSEMBL", toType = "ENTREZID", OrgDb=organism)
dedup_ids <- ids[!duplicated(ids[c("ENSEMBL")]),]
resdata2 <- resdata[resdata$Gene %in% dedup_ids$ENSEMBL,]
resdata2$Y <- dedup_ids$ENTREZID
kegg_gene_list <- resdata2$log2FoldChange
names(kegg_gene_list) <- resdata2$Y
kegg_gene_list <- na.omit(kegg_gene_list)
kegg_gene_list <- sort(kegg_gene_list, decreasing = TRUE)

library(R.utils)
R.utils::setOption('clusterProfiler.download.method', 'wget')
getOption('clusterProfiler.download.method')

kegg_organism <- "mmu"
kk2 <- gseKEGG(geneList     = kegg_gene_list,
               organism     = kegg_organism,
               nPerm        = 10000,
               minGSSize    = 3,
               maxGSSize    = 800,
               pvalueCutoff = 0.05,
               pAdjustMethod = "none",
               keyType       = "ncbi-geneid")
dotplot(kk2, showCategory = 10, title = "Enriched Pathways" , split=".sign") + facet_grid(.~.sign)