%%bash
# Log in to the remote analysis server.
# "yourname" and "serveraddress" look like placeholders - substitute your own login and host.
ssh yourname@serveraddress


%%bash
# Change into the course working directory under your home directory (~).
cd ~/TodosSantosTrinity2022


%%bash
# List the contents of the current directory.
ls


%%bash
# Create a dedicated conda environment and install Trinity 2.8.5 into it.
#
# `conda activate` only works once conda's shell functions are loaded; a login
# shell set up by `conda init` has them, a non-interactive shell does not, so
# load them explicitly (harmless if they are already present).
source "$(conda info --base)/etc/profile.d/conda.sh"

# --yes answers the confirmation prompt, which would otherwise stall a run
# that has no terminal attached.
conda create --yes --name Trinity
conda activate Trinity

# Bioconda packages depend on conda-forge; list it first so it has priority.
# NOTE(review): fastp, RSEM and bowtie2 are used later but are not installed
# here - presumably they are already available on the server; confirm.
conda install --yes -c conda-forge -c bioconda trinity=2.8.5


%%bash
# Template fastp command for one paired-end library.
# "library_name" looks like a placeholder - set lib to your library's prefix.
#   -i / -I            read 1 / read 2 input files
#   -o / -O            read 1 / read 2 trimmed output files
#   --length_required  minimum read length to keep after trimming
#   --html             where to write the HTML report
lib=library_name
fastp \
  -i "${lib}_1.fastq" -I "${lib}_2.fastq" \
  -o "trimmed_${lib}_1.fastq" -O "trimmed_${lib}_2.fastq" \
  --length_required 36 --html fastp.html


%%bash
# Trim adapters and filter reads for all six paired-end libraries
# (three "ref" replicates, three "brain" replicates).
#
# For each sample <s> this reads   <s>_1.fastq.gz / <s>_2.fastq.gz
# and writes               trimmed_<s>_1.fastq.gz / trimmed_<s>_2.fastq.gz
# plus a per-sample HTML and JSON report.
#
# --json is set per sample because fastp otherwise writes its JSON report to
# the default name fastp.json, so every run would overwrite the previous one.
for sample in ref_rep1 ref_rep2 ref_rep3 brain_rep1 brain_rep2 brain_rep3; do
  fastp \
    -i "${sample}_1.fastq.gz" -I "${sample}_2.fastq.gz" \
    -o "trimmed_${sample}_1.fastq.gz" -O "trimmed_${sample}_2.fastq.gz" \
    --length_required 36 \
    --html "${sample}_fastp.html" \
    --json "${sample}_fastp.json"
done


%%bash
# Pool the trimmed reads of every library into one left (read 1) and one
# right (read 2) file for the assembly. Concatenated gzip files are still a
# valid gzip stream, so no decompression is needed.
#
# The globs are pinned to the exact _1.fastq.gz / _2.fastq.gz suffixes so no
# unrelated file with "_1"/"_2" elsewhere in its name can slip into one side.
# Both globs expand in the same sorted order, which keeps mates paired.
cat trimmed_*_1.fastq.gz > reads.ALL.left.fastq.gz
cat trimmed_*_2.fastq.gz > reads.ALL.right.fastq.gz


%%bash
# De novo transcriptome assembly from the pooled paired-end reads.
# The assembly is read later from trinity_out_dir/Trinity.fasta.
trinity_args=(
  --seqType fq                       # input reads are FASTQ
  --left reads.ALL.left.fastq.gz     # pooled read 1
  --right reads.ALL.right.fastq.gz   # pooled read 2
  --CPU 4                            # number of CPU threads
  --max_memory 20G                   # memory ceiling
)
Trinity "${trinity_args[@]}"


%%bash
# Basic assembly statistics: number of transcripts, then total number of bases.

# Print the number of sequences in FASTA file $1 (one ">" header per sequence).
count_transcripts() {
  grep -c '^>' "$1"
}

# Print the total number of bases in FASTA file $1.
# Newlines are removed before counting; otherwise wc -m would also count the
# line terminator of every sequence line and overstate the total.
count_bases() {
  grep -v '^>' "$1" | tr -d '\n' | wc -m
}

count_transcripts trinity_out_dir/Trinity.fasta
count_bases trinity_out_dir/Trinity.fasta


Average transcript length = total bases / number of transcripts


%%bash
# Template: build an RSEM reference (with bowtie2 indices) from a FASTA file.
# 'transcripts.fasta' and 'prefix' look like placeholders - substitute your
# own FASTA file and the name prefix the reference files should get.
transcripts='transcripts.fasta'
ref_prefix='prefix'
rsem-prepare-reference --num-threads 4 --bowtie2 "$transcripts" "$ref_prefix"


%%bash
# Build the RSEM/bowtie2 reference from the Trinity assembly; the reference
# files are written under the prefix "hsa", which the quantification step uses.
# NOTE(review): no --transcript-to-gene-map is given, so RSEM presumably
# treats every transcript as its own gene in *.genes.results - confirm intended.
assembly=trinity_out_dir/Trinity.fasta
index_prefix=hsa
rsem-prepare-reference --num-threads 4 --bowtie2 "$assembly" "$index_prefix"


%%bash
# Template: quantify one paired-end library against an RSEM reference.
# The four values below look like placeholders - substitute your read files,
# the reference prefix given to rsem-prepare-reference, and an output name.
left='left_reads.fastq'
right='right_reads.fastq'
ref_prefix='prefix'
sample='name'
rsem-calculate-expression \
  --num-threads 4 --bowtie2 --paired-end --append-names \
  "$left" "$right" "$ref_prefix" "$sample"


# Quantify all six trimmed libraries against the "hsa" reference.
# Output names are <tissue><replicate>: ref1 ref2 ref3 brain1 brain2 brain3,
# processed in that order.
for tissue in ref brain; do
  for rep in 1 2 3; do
    rsem-calculate-expression \
      --num-threads 4 --bowtie2 --paired-end --append-names \
      "trimmed_${tissue}_rep${rep}_1.fastq.gz" \
      "trimmed_${tissue}_rep${rep}_2.fastq.gz" \
      hsa "${tissue}${rep}"
  done
done


%%bash
# Run the alignment/quantification script.
# NOTE(review): align.sh is not shown here; presumably it holds the six
# rsem-calculate-expression commands listed above - confirm.
bash align.sh


%%bash
# Page through the .cnt file in the ref1.stat directory.
# NOTE(review): presumably RSEM's alignment/read-count summary for sample ref1 - confirm.
less ref1.stat/ref1.cnt


%%bash
# Page through the gene-level results for sample ref1; the *.genes.results
# files are what gets imported into R with tximport later on.
less ref1.genes.results


%%bash
# Start an R session for the differential expression analysis.
R


%%R
# Install the BiocManager installer from CRAN if it cannot be loaded
if (!require("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}

# Install the Bioconductor packages used in this analysis, one after the other
for (pkg in c("tximport", "DESeq2")) {
    BiocManager::install(pkg)
}


%%R
# Attach the packages used below:
# tximport provides tximport(); DESeq2 provides the DESeq* functions, results() and counts()
library(tximport)
library(DESeq2)


%%R
# sample names, in the order used throughout the analysis (3 ref, then 3 brain)
# the c() function combines elements into a vector
sample_names = c("ref1", "ref2", "ref3", "brain1", "brain2", "brain3")

# each sample's RSEM file is "<sample>.genes.results"
# paste0() glues the suffix onto every sample name
gene_counts = paste0(sample_names, ".genes.results")

# label each file name with its sample name using names()
names(gene_counts) = sample_names


%%R
# import counts data from the RSEM *.genes.results files
# these files are already gene-level, so the input is not transcript-level
# (txIn = FALSE) and the output stays gene-level (txOut = FALSE)
txi_rsem = tximport(gene_counts, type = "rsem", txIn = FALSE, txOut = FALSE)

# change genes with an effective length of 0 to have an effective length of 1
# (DESeq2 cannot use a length of 0)
# the $ operator is used to extract a subset of a data object
txi_rsem$length[txi_rsem$length == 0] = 1

# create a sample table data frame (data frames are a common type of table in R)
# the order must match the sample order in gene_counts
# condition is stored as a factor, which is what DESeq2 expects for a design variable
sample_table = data.frame(condition = factor(c("ref", "ref", "ref", "brain", "brain", "brain")))

# see what the data frame looks like
sample_table

# add sample names to sample_table using the rownames() and colnames() functions
rownames(sample_table) = colnames(txi_rsem$counts)

# see what the modified data frame looks like
sample_table

# create a DESeq dataset (commonly given the name dds)
# design = ~condition means expression is modelled as a function of condition
dds = DESeqDataSetFromTximport(txi_rsem, colData = sample_table, design = ~condition)


%%R
# fit the DESeq2 model; the fitted object replaces dds
dds <- DESeq(dds)

# results table for brain relative to ref
# contrast = c(variable, numerator, denominator)
brain_vs_ref <- c("condition", "brain", "ref")
res <- results(dds, contrast = brain_vs_ref)

# show the first 6 rows of the results table
head(res, 6)


%%R
# save the differential expression results table as a csv file
write.csv(res, file = "Brain_vs_Ref.csv")

# save the normalized counts as a csv file
# counts(dds, normalized=TRUE) returns the counts after DESeq2 normalization
norm_counts <- counts(dds, normalized = TRUE)
write.csv(norm_counts, file = "Counts_Table.csv")


%%R
# normalized counts: one row per gene, one column per sample
cnts = counts(dds, normalized=TRUE)

# per-gene mean of the normalized counts for each condition
# columns 1-3 are the ref samples, columns 4-6 the brain samples
# cbind() places the two mean vectors side by side as a two-column matrix
cnts_avg = cbind(
    ref_mean = rowMeans(cnts[,1:3]),
    brain_mean = rowMeans(cnts[,4:6])
)


%%R
# subset the results table to contain only significantly differentially expressed genes
# (adjusted p-value below 0.05)
# the as.data.frame() converts an object to a data frame
# the subset() function returns a subset of data that meets a certain condition
# recall that res is our DESeq results table generated in step 5.4
res_sig = as.data.frame(subset(res, padj < 0.05))

# get only the significantly differentially expressed genes from the cnts_avg table
# rownames(res_sig) selects the rows whose names (genes) appear in res_sig
# square brackets, [], can be used to extract certain elements from a data object
# data_object[rows, columns], if either side of the ',' is empty, all rows or columns will be returned
# drop = FALSE keeps the result a two-column table even when only one gene is
# significant; without it a single row would collapse to a plain vector and
# column indexing such as cnts_avg_sig[,1] would fail
cnts_avg_sig = cnts_avg[rownames(res_sig), , drop = FALSE]


%%R
# open a PDF graphics device; everything drawn until dev.off() goes into plot.pdf
pdf("plot.pdf")

# log2-transform the mean normalized counts (column 1 = ref, column 2 = brain)
ref_all = log2(cnts_avg[,1])
brain_all = log2(cnts_avg[,2])
ref_sig = log2(cnts_avg_sig[,1])
brain_sig = log2(cnts_avg_sig[,2])

# all genes in grey
plot(ref_all, brain_all, main="brain vs ref", xlab="ref", ylab="brain", pch=16, col="grey80")

# overlay the genes in cnts_avg_sig (the significant subset) in cyan
points(ref_sig, brain_sig, pch=16, col="cyan2")

# guide lines: abline(a, b) draws y = a + b*x
# on log2 axes, intercept 0 is equal expression, +1 is brain = 2 x ref,
# and -1 is brain = 0.5 x ref
for (offset in c(0, 1, -1)) {
    abline(offset, 1, col="grey60")
}

# close the plot output file
dev.off()

RNA-seq: Transcriptome Assembly and Differential Expression Analysis

OUTLINE

Part 1. Getting started

Part 2. Trim adapters and filter low quality reads

Part 3. De novo transcriptome assembly

Part 4. Estimate the number of reads aligning to each transcript in each library

Part 5. Identify differentially expressed genes