## ----include = FALSE------------------------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -------------------------------------------------------------------------------------------------
system.file("extdata/proteinGroups.txt", 
            package = "proDA", mustWork = TRUE)

## -------------------------------------------------------------------------------------------------
full_data <- read.delim(
    system.file("extdata/proteinGroups.txt", 
                package = "proDA", mustWork = TRUE),
    stringsAsFactors = FALSE
)

head(colnames(full_data))

## -------------------------------------------------------------------------------------------------
# I use a regular expression (regex) to select the intensity columns
intensity_colnames <- grep("^LFQ\\.intensity\\.", colnames(full_data), value=TRUE)

# Create matrix which only contains the intensity columns
data <- as.matrix(full_data[, intensity_colnames])
colnames(data) <- sub("^LFQ\\.intensity\\.", "", intensity_colnames)
# Code missing values explicitly as NA
data[data == 0] <- NA
# log transformation to account for mean-variance relation
data <- log2(data)
# Overview of data
data[1:7, 1:6]
# Set rownames after showing data, because they are so long
rownames(data) <- full_data$Protein.IDs

## -------------------------------------------------------------------------------------------------
annotation_df <- data.frame(
    Condition = sub("\\.\\d+", "", sub("^LFQ\\.intensity\\.", 
                                       "", intensity_colnames)),
    Replicate = as.numeric(sub("^LFQ\\.intensity\\.[[:alnum:]]+\\.", 
                               "", intensity_colnames)),
    stringsAsFactors = FALSE, row.names = colnames(data)
)
head(annotation_df)

## ----eval=FALSE, include=TRUE---------------------------------------------------------------------
# # Not Run
# library(proDA)
# fit <- proDA(data, design= annotation_df$Condition, col_data = annotation_df)
# test_diff(fit, contrast = CG1407 - S2R)
# # End Not Run

## ----include=FALSE--------------------------------------------------------------------------------
library(SummarizedExperiment)
library(MSnbase)

## -------------------------------------------------------------------------------------------------
library(SummarizedExperiment)
se <- SummarizedExperiment(SimpleList(LFQ=data), colData=annotation_df)
rowData(se) <- full_data[, c("Only.identified.by.site", 
                             "Reverse", "Potential.contaminant")]
se

## -------------------------------------------------------------------------------------------------
library(MSnbase)

fData <- AnnotatedDataFrame(full_data[, c("Only.identified.by.site", 
                                 "Reverse", "Potential.contaminant")])
rownames(fData) <- rownames(data)
ms <- MSnSet(data, pData=AnnotatedDataFrame(annotation_df), fData=fData)
ms

## ----eval=FALSE, include=TRUE---------------------------------------------------------------------
# # Not Run
# library(proDA)
# fit <- proDA(se, design = ~ Condition - 1)
# test_diff(fit, contrast = ConditionCG1407 - ConditionS2R)
# # End Not Run

## ----include=FALSE--------------------------------------------------------------------------------
library(dplyr)
library(stringr)
library(readr)
library(tidyr)
library(tibble)

## -------------------------------------------------------------------------------------------------
library(dplyr)
library(stringr)
library(readr)
library(tidyr)
library(tibble)
# Or short 
# library(tidyverse)

## -------------------------------------------------------------------------------------------------
# The read_tsv function works faster and more reliable than read.delim
# But it sometimes needs help to identify the right type for each column,
# because it looks only at the first 1,000 elements. 
# Here, I explicitly define the `Reverse` column as a character column
full_data <- read_tsv(
    system.file("extdata/proteinGroups.txt", 
                package = "proDA", mustWork = TRUE),
    col_types = cols(Reverse = col_character())
)

full_data

## -------------------------------------------------------------------------------------------------
# I explicitly call `dplyr::select()` because there is a naming conflict
# between the tidyverse and BioConductor packages for `select()` function
tidy_data <- full_data %>%
    dplyr::select(ProteinID=Protein.IDs, starts_with("LFQ.intensity.")) %>%
    gather(Sample, Intensity, starts_with("LFQ.intensity.")) %>%
    mutate(Condition = str_match(Sample, 
                 "LFQ\\.intensity\\.([[:alnum:]]+)\\.\\d+")[,2]) %>%
    mutate(Replicate = as.numeric(str_match(Sample, 
                 "LFQ\\.intensity\\.[[:alnum:]]+\\.(\\d+)")[,2])) %>%
    mutate(SampleName = paste0(Condition, ".", Replicate))

tidy_data

## -------------------------------------------------------------------------------------------------
data <- tidy_data %>%
    mutate(Intensity = ifelse(Intensity == 0, NA, log2(Intensity))) %>%
    dplyr::select(ProteinID, SampleName, Intensity) %>%
    spread(SampleName, Intensity) %>%
    column_to_rownames("ProteinID") %>%
    as.matrix()

data[1:4, 1:7]

annotation_df <- tidy_data %>%
    dplyr::select(SampleName, Condition, Replicate) %>%
    distinct() %>%
    arrange(Condition, Replicate) %>%
    as.data.frame() %>%
    column_to_rownames("SampleName")

annotation_df

## -------------------------------------------------------------------------------------------------
library(SummarizedExperiment)
se <- SummarizedExperiment(SimpleList(LFQ=data), colData=annotation_df)
rowData(se) <- full_data[, c("Only.identified.by.site", 
                             "Reverse", "Potential.contaminant")]
se

## -------------------------------------------------------------------------------------------------
library(MSnbase)

fData <- AnnotatedDataFrame(full_data[, c("Only.identified.by.site", 
                                 "Reverse", "Potential.contaminant")])
rownames(fData) <- rownames(data)
ms <- MSnSet(data, pData=AnnotatedDataFrame(annotation_df), fData=fData)
ms

## ----eval=FALSE, include=TRUE---------------------------------------------------------------------
# # Not Run
# library(proDA)
# fit <- proDA(se, design = ~ Condition - 1)
# test_diff(fit, contrast = ConditionCG1407 - ConditionS2R)
# # End Not Run

## -------------------------------------------------------------------------------------------------
sessionInfo()

