## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>"
)

## ----install_biocmanager,eval=FALSE-------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("curatedAdipoRNA")

## ----loading_libraries, message=FALSE-----------------------------------------
# loading required libraries
library(curatedAdipoRNA)
library(SummarizedExperiment)
library(S4Vectors)
library(fastqcr)
library(DESeq2)
library(dplyr)
library(tidyr)
library(ggplot2)

## ----loading_data-------------------------------------------------------------
# load data
data("adipo_counts")

# print object
adipo_counts

## ----adipo_counts-------------------------------------------------------------
# print count matrix
assay(adipo_counts)[1:5, 1:5]

## ----colData------------------------------------------------------------------
# names of the coldata object
names(colData(adipo_counts))

# table of times column
table(colData(adipo_counts)$time)

# table of stage column
table(colData(adipo_counts)$stage)

## ----rowRanges----------------------------------------------------------------
# print GRanges object
rowRanges(adipo_counts)

## ----qc-----------------------------------------------------------------------
# show qc data
adipo_counts$qc

# show the class of the first entry in qc
class(adipo_counts$qc[[1]][[1]])

## ----metadata, message=FALSE--------------------------------------------------
# print data of first study
metadata(adipo_counts)$studies[1,]

## ----summary_table,echo=FALSE-------------------------------------------------
# generate a study summary table
colData(adipo_counts) %>%
as.data.frame() %>%
  group_by(study_name) %>%
  summarise(pmid = unique(pmid),
            nsamples = n(),
            time = paste(unique(time), collapse = '/'),
            stages = paste(unique(stage), collapse = '/'),
            instrument_model = unique(instrument_model)) %>%
  knitr::kable(align = 'cccccc',
               col.names = c('GEO series ID', 'PubMed ID', 'Num. of Samples',
                             'Time (hr)', 'Differentiation Stage', 
                             'Instrument Model'))

## ----subset_object------------------------------------------------------------
# subsetting counts to 0 and 24 hours
se <- adipo_counts[, adipo_counts$time %in% c(0, 24)]

# showing the numbers of features, samples and time groups
dim(se)
table(se$time)

## ----filtering_samples--------------------------------------------------------
# filtering low quality samples
# chek the library layout
table(se$library_layout)

# check the number of files in qc
qc <- se$qc
table(lengths(qc))

# flattening qc list
qc <- unlist(qc, recursive = FALSE)
length(qc)

## ----per_base_scores----------------------------------------------------------
# extracting per_base_sequence_quality
per_base <- lapply(qc, function(x) {
  df <- x[['per_base_sequence_quality']]
  df %>%
    select(Base, Mean) %>%
    transform(Base = strsplit(as.character(Base), '-')) %>%
    unnest(Base) %>%
    mutate(Base = as.numeric(Base))
}) %>%
  bind_rows(.id = 'run')

## ----score_summary------------------------------------------------------------
# a quick look at quality scores
summary(per_base)

## ----finding_low_scores,fig.align='centre',fig.height=3,fig.width=7-----------
# find low quality runs
per_base <- per_base %>%
  group_by(run) %>%
  mutate(length = max(Base) > 150,
         run_average = mean(Mean) > 34)

# plot average per base quality
per_base %>%
  ggplot(aes(x = Base, y = Mean, group = run, color = run_average)) +
  geom_line() +
  facet_wrap(~length, scales = 'free_x')

## ----remove_lowscore----------------------------------------------------------
# get run ids of low quality samples
bad_samples <- data.frame(samples = unique(per_base$run[per_base$run_average == FALSE]))
bad_samples <- separate(bad_samples, col = samples, into = c('id', 'run'), sep = '\\.')

# subset the counts object
se2 <- se[, !se$id %in% bad_samples$id]
table(se2$time)

## ----remove low_counts--------------------------------------------------------
# filtering low count genes
low_counts <- apply(assay(se2), 1, function(x) length(x[x>10])>=2)
table(low_counts)

# subsetting the count object
se3 <- se2[low_counts,]

## ----differential expression--------------------------------------------------
# differential expression analysis
se3$time <- factor(se3$time)
dds <- DESeqDataSet(se3, ~time)
dds <- DESeq(dds)
res <- results(dds)
table(res$padj < .1)

## ----pca,fig.align='centre',fig.height=4,fig.width=4--------------------------
# explaining variabce 
plotPCA(rlog(dds), intgroup = 'time')
plotPCA(rlog(dds), intgroup = 'library_layout')
plotPCA(rlog(dds), intgroup = 'instrument_model')

## ----studies_keys-------------------------------------------------------------
# keys of the studies in this subset of the data
unique(se3$bibtexkey)

## ----citation, warning=FALSE--------------------------------------------------
# citing the package
citation("curatedAdipoRNA")

## ----session_info-------------------------------------------------------------
devtools::session_info()

