## ----rmdsetup, include = FALSE------------------------------------------------
# Vignette-wide chunk defaults: merge each chunk's source and output into a
# single block, and prefix every output line with "#>".
do.call(
  knitr::opts_chunk$set,
  list(comment = "#>", collapse = TRUE)
)

## ----setup, eval=TRUE, echo=TRUE, cache=FALSE---------------------------------
library(seventyGeneData)

## ----downloadVantVeer, eval=FALSE, echo=TRUE, cache=FALSE---------------------
# ### Create a working directory
# dir.create("../extdata/vantVeer", showWarnings = FALSE, recursive = TRUE)
# ### Create the URL list for all supplementary data on the NKI and Nature websites
# nkiUrl <- "http://bioinformatics.nki.nl/data/van-t-Veer_Nature_2002/"
# natureUrl <- "http://www.nature.com/nature/journal/v415/n6871/extref/"
# urlList <- c(
#   paste(nkiUrl,
#     sep = "",
#     c(
#       "ArrayData_greater_than_5yr.zip",
#       "ArrayData_less_than_5yr.zip", "ArrayData_19samples.zip",
#       "ArrayData_BRCA1.zip", "ArrayNomenclature_contig_accession.xls",
#       "ArrayNomenclature_methods.doc", "ProbeSeq.xls",
#       "README-Nature_I.doc", "codeboek_Rosetta.doc"
#     )
#   ),
#   paste(natureUrl,
#     sep = "",
#     c(
#       "415530a-s7.doc", "415530a-s8.xls",
#       "415530a-s9.xls", "415530a-s10.xls", "415530a-s11.xls"
#     )
#   )
# )
# ### Download all files from Nature and NKI
# lapply(urlList, function(x) {
#   download.file(x,
#     destfile = paste("../extdata/vantVeer/", gsub(".+/", "", x), sep = ""),
#     quiet = FALSE, mode = "w", cacheOK = TRUE
#   )
# })

## ----downloadVanDeVijver, eval=FALSE, echo=TRUE, cache=FALSE------------------
# ### Create a working directory
# dir.create("../extdata/vanDeVijver", showWarnings = FALSE, recursive = TRUE)
# ### Create the url list for all supplementary data on the NKI Website
# nkiUrl <- "http://bioinformatics.nki.nl/data/"
# urlList <- paste(nkiUrl, sep = "", c("nejm_table1.zip", "ZipFiles295Samples.zip"))
# ### Download all files from NKI
# lapply(urlList, function(x) {
#   download.file(x,
#     destfile = paste("../extdata/vanDeVijver/", gsub(".+/", "", x), sep = ""),
#     quiet = FALSE, mode = "w", cacheOK = TRUE
#   )
# })

## ----getPackagesBioc, eval=FALSE, echo=TRUE, cache=FALSE----------------------
# ### Get the list of installed packages
# installedPckgs <- installed.packages()[, "Package"]
# ### Define the list of desired libraries
# pckgListBIOC <- c("Biobase", "limma", "breastCancerNKI", "readxl")
# ### Use the BiocManager package from Bioconductor
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#   install.packages("BiocManager")
# }
# ### Load the packages, install them from Bioconductor if needed
# for (pckg in pckgListBIOC) {
#   if (!pckg %in% installedPckgs) BiocManager::install(pckg)
#   require(pckg, character.only = TRUE)
# }

## ----assembleAnnotation, eval=FALSE, echo=TRUE, cache=FALSE-------------------
# ### Load the library with annotation
# library(Biobase)
# library(breastCancerNKI)
# ### Load the dataset
# data(nki)
# ### Check dataset classes and attributes
# class(nki)
# dim(nki)
# ### Check featureData
# str(featureData(nki))
# nkiAnn <- featureData(nki)
# ### Turn all annotation information into character
# nkiAnn@data <- as.data.frame(apply(nkiAnn@data, 2, as.character),
#   stringsAsFactors = FALSE
# )

## ----assembleAnnotation2, eval=FALSE, echo=TRUE, cache=FALSE------------------
# ### Load the library
# library(readxl)
# ### Read GBACC information for van't Veer dataset
# myFile <- system.file("extdata/vantVeer", "ArrayNomenclature_contig_accession.xls",
#   package = "seventyGeneData"
# )
# featAcc <- read_xls(myFile)
# ### Read seq information for van't Veer dataset
# myFile <- system.file("extdata/vantVeer", "ProbeSeq.xls",
#   package = "seventyGeneData"
# )
# featSeq <- read_xls(myFile)
# ### Read 70-genes signature information for van't Veer dataset
# myFile <- system.file("extdata/vantVeer", "415530a-s9.xls",
#   package = "seventyGeneData"
# )
# gns231 <- read_xls(myFile)
# ### Remove special characters in the column headers,
# ### which are due to white spaces present in the Excel files
# colnames(gns231) <- gsub("\\s|#", "", colnames(gns231))
# ### Remove GO annotation
# gns231 <- gns231[, -grep("sp_xref_keyword_list", colnames(gns231))]
# ### Reorder the genes in decreasing order by absolute correlation
# gns231 <- gns231[order(abs(gns231$correlation), decreasing = TRUE), ]
# ### Select the feature identifiers corresponding to the top 231 and 70 genes
# gns231$genes231 <- TRUE
# gns231$genes70 <- gns231$accession %in% gns231$accession[1:70]
# ### Merge all information (including 70-gene signature information)
# ### with the annotation obtained from the breastCancerNKI package
# newAnn <- nkiAnn@data
# newAnn <- merge(newAnn, featAcc, by.x = 1, by.y = 1, all = TRUE, sort = FALSE)
# newAnn <- merge(newAnn, featSeq, by.x = 1, by.y = 1, all = TRUE, sort = FALSE)
# newAnn <- merge(newAnn, gns231, by.x = 1, by.y = 1, all = TRUE, sort = FALSE)

## ----assembleAnnotation3, eval=FALSE, echo=TRUE, cache=FALSE------------------
# ### Check the structure of the new annotation data.frame
# newAnn <- newAnn[order(newAnn[, 1]), ]
# str(newAnn)

## ----assembleVantVeer, eval=FALSE, echo=TRUE, cache=FALSE---------------------
# ### Load the library
# library(Biobase)
# library(readxl)
# ### Check presence of downloaded files
# filesVtVloc <- system.file("extdata/vantVeer", package = "seventyGeneData")
# dir(filesVtVloc)
# ### Create list of files to be read in
# filesVtV <- dir(filesVtVloc, full.names = TRUE, pattern = "^ArrayData")
# filesVtV

## ----assembleVantVeer2, eval=FALSE, echo=TRUE, cache=FALSE--------------------
# myFile <- system.file("extdata/vantVeer", "415530a-s8.xls",
#   package = "seventyGeneData"
# )
# ### Read phenotypic information
# phenoVtV <- as.data.frame(read_xls(myFile))
# ### Show Phenotypic information
# str(phenoVtV)

## ----assembleVantVeer3, eval=FALSE, echo=TRUE, cache=FALSE--------------------
# ### Remove the special characters in the column headers
# ### due to white spaces present in the Excel file
# colnames(phenoVtV) <- gsub("\\s|#", "", colnames(phenoVtV))
# #### Remove columns that do not contain useful information
# phenoVtV <- phenoVtV[, apply(phenoVtV, 2, function(x) length(unique(x)) > 1)]
# phenoVtV$SampleName <- paste("Sample", phenoVtV$Sample)
# rownames(phenoVtV) <- phenoVtV$SampleName
# ### Read sample names from the expression data tables
# samplesVtV <- lapply(filesVtV, read.table,
#   nrow = 1, header = FALSE, sep = "\t",
#   stringsAsFactors = FALSE, fill = TRUE, strip.white = TRUE
# )
# ### Format the samples strings
# samplesVtV <- lapply(samplesVtV, function(x) x[grep("^Sample", x)])
# headerDesc <- samplesVtV
# samplesVtV <- lapply(samplesVtV, function(x) gsub(",.+", "", x))

## ----assembleVantVeer4, eval=FALSE, echo=TRUE, cache=FALSE--------------------
# ### Check sample labels obtained from expression data files
# str(samplesVtV)
# ### Combine the labels in one unique vector
# allSamplesVtV <- do.call("c", samplesVtV)
# ### Compare the order of the samples between the expression data
# ### and phenotypic information data.frames
# if (all(rownames(phenoVtV) %in% allSamplesVtV)) {
#   print("All sample names match phenoData")
#   if (all(rownames(phenoVtV) == allSamplesVtV)) {
#     print("All sample names match phenoData")
#   } else {
#     print("Sample names from tables and phenoData need reordering")
#     phenoVtV <- phenoVtV[order(phenoVtV$SampleName), ]
#   }
# } else {
#   print("Sample names DO NOT match phenoData")
# }

## ----assembleVantVeer5, eval=FALSE, echo=TRUE, cache=FALSE--------------------
# ### Read expression data from the 4 converted TAB-delimited text files
# dataVtV <- lapply(filesVtV, read.table,
#   skip = 1, sep = "\t", quote = "",
#   header = TRUE, row.names = NULL,
#   stringsAsFactors = FALSE, fill = FALSE, strip.white = FALSE
# )
# sapply(dataVtV, dim)
# ### Extract annotation: note that column headers are slightly different
# sapply(dataVtV, function(x) head(colnames(x)))
# sapply(dataVtV, function(x) tail(colnames(x)))
# ### Extract the associated annotation
# annVtV <- lapply(dataVtV, function(x) x[, c("Systematic.name", "Gene.name")])
# annVtV <- lapply(annVtV, function(x) {
#   x[x == ""] <- NA
#   x
# })
# annVtV <- do.call("cbind", annVtV)

## ----assembleVantVeer6, eval=FALSE, echo=TRUE, cache=FALSE--------------------
# ### Check annotation order in all data files
# if (all(apply(annVtV[, seq(1, 8, by = 2)], 1, function(x) length(unique(x)) == 1))) {
#   print("OK")
#   annVtV <- annVtV[, 1:2]
# } else {
#   print("Check annotation")
# }

## ----extractColumns, eval=FALSE, echo=TRUE, cache=FALSE-----------------------
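# ### Define a helper that keeps the columns whose names match `pattern`,
# ### sets the row names to `ann`, and sorts the rows by row name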
# extractColumns <- function(x, pattern, ann) {
#   sel <- grep(pattern, colnames(x), value = TRUE)
#   x <- x[, sel]
#   rownames(x) <- ann
#   x <- x[order(rownames(x)), ]
# }

## ----assembleVantVeer7, eval=FALSE, echo=TRUE, cache=FALSE--------------------
# ### Extract log ratio data from all the spreadsheets
# logRat <- lapply(dataVtV, extractColumns, pattern = "Log10\\.ratio", ann = annVtV[, 1])
# logRat <- do.call("cbind", logRat)
# ### Assign colnames and reorder the columns
# colnames(logRat) <- allSamplesVtV
# logRat <- logRat[, order(colnames(logRat)), ]

## ----assembleVantVeer8, eval=FALSE, echo=TRUE, cache=FALSE--------------------
# ### Check order
# all(phenoVtV$SampleName == colnames(logRat))

## ----assembleVantVeer9, eval=FALSE, echo=TRUE, cache=FALSE--------------------
# ### Extract p-values from all the spreadsheets
# pVal <- lapply(dataVtV, extractColumns, pattern = "value", ann = annVtV[, 1])
# pVal <- do.call("cbind", pVal)
# ### Assign colnames and reorder the columns
# colnames(pVal) <- allSamplesVtV
# pVal <- pVal[, order(colnames(pVal)), ]

## ----assembleVantVeer10, eval=FALSE, echo=TRUE, cache=FALSE-------------------
# ### Check order
# all(phenoVtV$SampleName == colnames(pVal))

## ----assembleVantVeer11, eval=FALSE, echo=TRUE, cache=FALSE-------------------
# ### Extract expression intensity from all the spreadsheets
# intensity <- lapply(dataVtV, extractColumns, pattern = "Intensity", ann = annVtV[, 1])
# intensity <- do.call("cbind", intensity)
# ### Assign colnames and reorder the columns
# colnames(intensity) <- allSamplesVtV
# intensity <- intensity[, order(colnames(intensity)), ]

## ----assembleVantVeer12, eval=FALSE, echo=TRUE, cache=FALSE-------------------
# ### Check order
# all(phenoVtV$SampleName == colnames(intensity))

## ----assembleVantVeer13, eval=FALSE, echo=TRUE, cache=FALSE-------------------
# ### Merge annotation objects and check order
# annVtV <- merge(annVtV, newAnn, by = 1, all = TRUE, sort = TRUE)
# rownames(annVtV) <- annVtV[, 1]
# all(rownames(annVtV) == rownames(logRat))
# all(rownames(annVtV) == rownames(pVal))
# all(rownames(annVtV) == rownames(intensity))
# ### Create the new assayData
# myAssayData <- assayDataNew(exprs = logRat, pValue = pVal, intensity = intensity)
# ### Create the new phenoData
# myPhenoData <- new("AnnotatedDataFrame", phenoVtV)
# ### Create the new featureData
# myFeatureData <- new("AnnotatedDataFrame", annVtV)
# ### Create the new experimentData
# myExperimentData <- new("MIAME",
#   name = "Marc J Van De Vijver, Hongyue Dai, and Laura J van't Veer",
#   lab = "The Netherland Cancer Institute, Amsterdam, The Netherlands",
#   contact = "Luigi Marchionni <marchion@gmail.com>",
#   title = "Gene expression profiling predicts clinical outcome of breast cancer",
#   abstract = "Breast cancer patients with the same stage of disease can have markedly different treatment responses and overall outcome.
# The strongest predictors for metastases (for example, lymph node status and histological grade) fail to classify accurately breast tumours according to their clinical behaviour.
# Chemotherapy or hormonal therapy reduces the risk of distant metastases by approximately one-third; however, 70-80% of patients receiving this treatment would have survived without it.
# None of the signatures of breast cancer gene expression reported to date allow for patient-tailored therapy strategies.
# Here we used DNA microarray analysis on primary breast tumours of 117 young patients, and applied supervised classification to identify a gene expression signature strongly predictive of a short interval to distant metastases (`poor prognosis' signature) in patients without tumour cells in local lymph nodes at diagnosis (lymph node negative).
# In addition, we established a signature that identifies tumours of BRCA1 carriers. The poor prognosis signature consists of genes regulating cell cycle, invasion, metastasis and angiogenesis.
# This gene expression profile will outperform all currently used clinical parameters in predicting disease outcome. Our findings provide a strategy to select patients who would benefit from adjuvant therapy.",
#   url = "http://www.ncbi.nlm.nih.gov/pubmed/?term=11823860",
#   pubMedIds = "11823860"
# )
# ### Create the expression set
# vantVeer <- new("ExpressionSet",
#   assayData = myAssayData,
#   phenoData = myPhenoData,
#   featureData = myFeatureData,
#   experimentData = myExperimentData
# )

## ----assembelVanDeVijver, eval=FALSE, echo=TRUE, cache=FALSE------------------
# ##################################################
# ### Load the library
# library(Biobase)
# library(readxl)
# ##################################################
# ### Check presence of downloaded files
# dir("../inst/extdata/vanDeVijver")

## ----assembelVanDeVijve1, eval=FALSE, echo=TRUE, cache=FALSE------------------
# ### Check presence of downloaded files
# filesVdVloc <- system.file("extdata/vanDeVijver", package = "seventyGeneData")
# dir(filesVdVloc)
# ### Create list of files to be unzipped and read in
# filesVdVzip <- dir(filesVdVloc, full.names = TRUE)
# filesVdVzip
# ### Create output directory
# myTmpDir <- paste(filesVdVloc, "/tmp", sep = "")
# ### Decompress expression
# unzip(filesVdVzip[1], exdir = myTmpDir)
# ### Decompress phenoData
# unzip(filesVdVzip[2], exdir = myTmpDir)
# ### List of files in "ZipFiles295Samples.zip" containing expression
# filesVdV <- dir(myTmpDir, full.names = TRUE, pattern = "NKI")
# ### Show file list content
# filesVdV

## ----assembelVanDeVijver2, eval=FALSE, echo=TRUE, cache=FALSE-----------------
# ### Read phenotypic information
# myFile <- dir(myTmpDir, full.names = TRUE, pattern = "Table1_ClinicalData_Table.xls")
# phenoVdV <- as.data.frame(read_xls(myFile, skip = 3))
# #### Remove columns that do not contain useful information
# phenoVdV <- phenoVdV[, apply(phenoVdV, 2, function(x) length(unique(x)) > 1)]
# phenoVdV$SampleName <- paste("Sample", phenoVdV$SampleID)
# rownames(phenoVdV) <- phenoVdV$SampleName
# ### Read sample names from the expression data spreadsheets
# samplesVdV <- lapply(filesVdV, scan, what = "character", nlines = 1, sep = "\t", strip.white = FALSE)
# samplesVdV <- lapply(samplesVdV, function(x) x[x != ""])
# allSamplesVdV <- do.call("c", samplesVdV)
# ### Read all data contained in the expression data spreadsheets
# dataVdV <- lapply(filesVdV, read.table,
#   header = TRUE, skip = 1, sep = "\t", quote = "",
#   stringsAsFactors = FALSE, fill = TRUE, strip.white = TRUE
# )
# ### Extract feature annotation
# annVdV <- lapply(dataVdV, function(x) x[, c("Substance", "Gene")])
# annVdV <- lapply(annVdV, function(x) {
#   x[x == ""] <- NA
#   x
# })
# annVdV <- do.call("cbind", annVdV)

## ----assembelVanDeVijver2a, eval=FALSE, echo=TRUE, cache=FALSE----------------
# ### Check annotation order in all data files
# if (all(apply(annVdV[, seq(1, 12, by = 2)], 1, function(x) length(unique(x)) == 1))) {
#   print("OK")
#   annVdV <- annVdV[, 1:2]
# } else {
#   print("Check annotation")
# }

## ----extractColumns2, eval=FALSE, echo=TRUE, cache=FALSE----------------------
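# ### Redefine the helper for the van de Vijver files: "Log.Ratio.Error" columns
# ### are first renamed to "Error" so that the "Log\\.Ratio" pattern skips them
# ### (callers pass `ann = ...`, which partially matches the `annVdV` argument)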
# extractColumns <- function(x, pattern, annVdV) {
#   colnames(x) <- gsub("Log\\.Ratio\\.Error", "Error", colnames(x))
#   sel <- grep(pattern, colnames(x), value = TRUE)
#   x <- x[, sel]
#   rownames(x) <- annVdV
#   x <- x[order(rownames(x)), ]
# }

## ----assembelVanDeVijver3, eval=FALSE, echo=TRUE, cache=FALSE-----------------
# ### Extract and assemble the log ratio values
# logRat <- lapply(dataVdV, extractColumns, pattern = "Log\\.Ratio", ann = annVdV[, 1])
# logRat <- do.call("cbind", logRat)
# ### Set the column names
# colnames(logRat) <- allSamplesVdV

## ----assembelVanDeVijver4, eval=FALSE, echo=TRUE, cache=FALSE-----------------
# ### Check order
# all(phenoVdV$SampleName == colnames(logRat))

## ----assembelVanDeVijver5, eval=FALSE, echo=TRUE, cache=FALSE-----------------
# ### Extract log ratio error
# logRatError <- lapply(dataVdV, extractColumns, pattern = "Error", ann = annVdV[, 1])
# logRatError <- do.call("cbind", logRatError)
# ### Set the column names
# colnames(logRatError) <- allSamplesVdV

## ----assembelVanDeVijver6, eval=FALSE, echo=TRUE, cache=FALSE-----------------
# ### Check order
# all(phenoVdV$SampleName == colnames(logRatError))

## ----assembelVanDeVijver7, eval=FALSE, echo=TRUE, cache=FALSE-----------------
# ### Extract P-value
# pVal <- lapply(dataVdV, extractColumns, pattern = "alue", ann = annVdV[, 1])
# pVal <- do.call("cbind", pVal)
# ### Set the column names
# colnames(pVal) <- allSamplesVdV

## ----assembelVanDeVijver8, eval=FALSE, echo=TRUE, cache=FALSE-----------------
# ### Check order
# all(phenoVdV$SampleName == colnames(pVal))

## ----assembelVanDeVijver9, eval=FALSE, echo=TRUE, cache=FALSE-----------------
# ### Extract Intensity
# intensity <- lapply(dataVdV, extractColumns, pattern = "Intensity", ann = annVdV[, 1])
# intensity <- do.call("cbind", intensity)
# ### Set the column names
# colnames(intensity) <- allSamplesVdV

## ----assembelVanDeVijver10, eval=FALSE, echo=TRUE, cache=FALSE----------------
# ### Check order
# all(phenoVdV$SampleName == colnames(intensity))

## ----assembelVanDeVijver11, eval=FALSE, echo=TRUE, cache=FALSE----------------
# ### Merge and check order
# annVdV <- merge(annVdV, newAnn, by = 1, all = TRUE, sort = TRUE)
# rownames(annVdV) <- annVdV[, 1]
# all(rownames(annVdV) == rownames(logRat))
# all(rownames(annVdV) == rownames(logRatError))
# all(rownames(annVdV) == rownames(pVal))
# all(rownames(annVdV) == rownames(intensity))
# ### Create the new assayData
# myAssayData <- assayDataNew(
#   exprs = logRat, exprsError = logRatError,
#   pValue = pVal, intensity = intensity
# )
# ### Create the new phenoData
# myPhenoData <- new("AnnotatedDataFrame", phenoVdV)
# ### Create the new featureData
# myFeatureData <- new("AnnotatedDataFrame", annVdV)
# ### Create the new experimentData
# myExperimentData <- new("MIAME",
#   name = "Marc J Van De Vijver, Yudong D He, and Laura J van't Veer",
#   lab = "The Netherland Cancer Institute, Amsterdam, The Netherlands",
#   contact = "Luigi Marchionni <marchion@gmail.com>",
#   title = "A gene-expresion signature as a predictor  of survival in breast cancer",
#   abstract = "Background: A more accurate means of prognostication in breast cancer will improve the selection of patients for adjuvant systemic therapy.
# Methods: Using microarray analysis to evaluate our previously established 70-gene prognosis profile, we classified a series of 295 consecutive patients with primary breast carcinomas as having a gene expression signature associated with either a poor prognosis or a good prognosis.
# All patients had stage I or II breast cancer and were younger than 53 years old; 151 had lymph-node-negative disease, and 144 had lymph-node-positive disease. We evaluated the predictive power of the prognosis profile using univariable and multivariable statistical analyses.
# Results: Among the 295 patients, 180 had a poor-prognosis signature and 115 had a good-prognosis signature, and the mean (+/-SE) overall 10-year survival rates were 54.6+/-4.4 percent and 94.5+/-2.6 percent, respectively.
# At 10 years, the probability of remaining free of distant metastases was 50.6+/-4.5 percent in the group with a poor-prognosis signature and 85.2+/-4.3 percent in the group with a good-prognosis signature.
# The estimated hazard ratio for distant metastases in the group with a poor-prognosis signature, as compared with the group with the good-prognosis signature, was 5.1 (95 percent confidence interval, 2.9 to 9.0; P<0.001).
# This ratio remained significant when the groups were analyzed according to lymph-node status. Multivariable Cox regression analysis showed that the prognosis profile was a strong independent factor in predicting disease outcome.
# Conclusions: The gene-expression profile we studied is a more powerful predictor of the outcome of disease in young patients with breast cancer than standard systems based on clinical and histologic criteria. (N Engl J Med 2002;347:1999-2009.)",
#   url = "http://www.ncbi.nlm.nih.gov/pubmed/?term=12490681",
#   pubMedIds = "12490681"
# )
# ### Create the expression set
# vanDeVijver <- new("ExpressionSet",
#   assayData = myAssayData,
#   phenoData = myPhenoData,
#   featureData = myFeatureData,
#   experimentData = myExperimentData
# )
# ### Remove temporary folder
# file.remove(dir(myTmpDir, full.names = TRUE))
# file.remove(myTmpDir)

## ----addSetInfo, eval=FALSE, echo=TRUE, cache=FALSE---------------------------
# ### Define the data set type from file of origin
# type <- gsub("..txt", "", gsub(".+ArrayData_", "", filesVtV))
# dataSetType <- mapply(x = samplesVtV, y = type, FUN = function(x, y) {
#   rep(y, length(x))
# })
# ### Combine with sample information
# dataSetType <- do.call("c", dataSetType)
# names(dataSetType) <- allSamplesVtV
# ### Reorder
# dataSetType <- dataSetType[order(names(dataSetType))]

## ----addSetInfo1, eval=FALSE, echo=TRUE, cache=FALSE--------------------------
# ### Add the information to pData(vantVeer)
# if (all(rownames(pData(vantVeer)) == names(dataSetType))) {
#   pData(vantVeer)$DataSetType <- dataSetType
#   print("Adding information about data set type to pData")
# } else {
#   print("Check order pData and data set type information")
# }

## ----ttmVentVeer, eval=FALSE, echo=TRUE, cache=FALSE--------------------------
# ### Process time to metastasis (TTM)
# pData(vantVeer)$TTM <- pData(vantVeer)$followup.time.yr
# #### Process TTM event
# pData(vantVeer)$TTMevent <- pData(vantVeer)$metastases
# #### Create binary TTM at 5 years groups
# pData(vantVeer)$FiveYearMetastasis <- pData(vantVeer)$TTM < 5 & pData(vantVeer)$TTMevent == 1
# ### Show structure of updated phenotypes
# str(pData(vantVeer))
# ### Save the final ExpressionSet object
# dataDirLoc <- system.file("data", package = "seventyGeneData")
# save(vantVeer, file = paste(dataDirLoc, "/vantVeer.rda", sep = ""))

## ----ttmVanDeVijver, eval=FALSE, echo=TRUE, cache=FALSE-----------------------
# ### Extract the phenotypic data of the van de Vijver samples
# pVDV <- pData(vanDeVijver)
# ### Rename columns
# selNames <- c("TIMEmeta", "EVENTmeta", "TIMEsurvival", "EVENTdeath", "TIMErecurrence")
# newNames <- c("TTM", "TTMevent", "OS", "OSevent", "RFS")
# colnames(pVDV)[sapply(selNames, grep, colnames(pVDV))] <- newNames
# ### Process time to metastasis (TTM): where it is NaN, use the overall survival time
# pVDV$TTM[is.nan(pVDV$TTM)] <- pVDV$OS[is.nan(pVDV$TTM)]
# ### Process recurrence free survival (RFS) adding RFSevent
# pVDV$RFSevent <- pVDV$RFS < pVDV$OS
# ### Create binary TTM at 5 years groups selecting:
# ### 1) the cases with metastases as first event within 5 years
# badCases <- which(
#   pVDV$TTM <= pVDV$RFS ### Met is 1st recurrence
#   & pVDV$TTMevent == 1 ### Metastases occurred
#   & pVDV$TTM < 5 ### Recurrence within 5 years
# )
# ### 2) the cases disease free for at least 5 years
# goodCases <- which(
#   pVDV$TTM > 5 ### No metastasis before 5 years
#   & pVDV$RFS > 5 ### No recurrence before 5 years
#   & pVDV$TTMevent == 0 ### Metastases did not occur
# )

## ----ttmVanDeVijver2, eval=FALSE, echo=TRUE, cache=FALSE----------------------
# ### Check if there are duplicated cases present in both prognostic groups
# all(!goodCases %in% badCases)

## ----ttmVanDeVijver3, eval=FALSE, echo=TRUE, cache=FALSE----------------------
# ### Create groups by setting all cases to NA and then identifying bad cases
# pVDV$FiveYearMetastasis <- NA
# pVDV$FiveYearMetastasis[badCases] <- TRUE
# ### And then excluding patients with a relapse before a metastasis within 5 years
# pVDV$FiveYearMetastasis[goodCases] <- FALSE
# ### Assign updated phenotypic data
# pData(vanDeVijver) <- pVDV
# ### Show structure of updated phenotypes
# str(pData(vanDeVijver))
# ### Save the final ExpressionSet object
# dataDirLoc <- system.file("data", package = "seventyGeneData")
# save(vanDeVijver, file = paste(dataDirLoc, "/vanDeVijver.rda", sep = ""))

## ----A.sessioInfo, echo=TRUE, eval=TRUE, cache=FALSE--------------------------
# Report the R version, platform, and attached/loaded package versions
# so the vignette output documents the environment it was built in.
sessionInfo()