## ----eval = FALSE-------------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly=TRUE))
#     install.packages("BiocManager")
# BiocManager::install(("TCGAbiolinksGUI.data")

## ----eval = FALSE-------------------------------------------------------------
# BiocManager::install("BioinformaticsFMRP/TCGAbiolinksGUI.data")

## ----eval = TRUE--------------------------------------------------------------
library(TCGAbiolinksGUI.data)

## ----eval=FALSE, include=TRUE-------------------------------------------------
# # Defining parameters
# getGDCdisease <- function(){
#   projects <- TCGAbiolinks:::getGDCprojects()
#   projects <- projects[projects$id != "FM-AD",]
#   disease <-  projects$project_id
#   idx <- grep("disease_type",colnames(projects))
#   names(disease) <-  paste0(projects[[idx]], " (",disease,")")
#   disease <- disease[sort(names(disease))]
#   return(disease)
# }

## -----------------------------------------------------------------------------
data(GDCdisease)
DT::datatable(as.data.frame(GDCdisease))

## ----eval=FALSE, include=TRUE-------------------------------------------------
# getMafTumors <- function(){
#   root <- "https://gdc-api.nci.nih.gov/data/"
#   maf <- fread("https://gdc-docs.nci.nih.gov/Data/Release_Notes/Manifests/GDC_open_MAFs_manifest.txt",
#                data.table = FALSE, verbose = FALSE, showProgress = FALSE)
#   tumor <- unlist(lapply(maf$filename, function(x){unlist(str_split(x,"\\."))[2]}))
#   proj <- TCGAbiolinks:::getGDCprojects()
# 
#   disease <-  gsub("TCGA-","",proj$project_id)
#   idx <- grep("disease_type",colnames(proj))
#   names(disease) <-  paste0(proj[[idx]], " (",proj$project_id,")")
#   disease <- sort(disease)
#   ret <- disease[disease %in% tumor]
#   return(ret)
# }

## -----------------------------------------------------------------------------
data(maf.tumor)
DT::datatable(as.data.frame(maf.tumor))

## ----eval = FALSE, include = TRUE---------------------------------------------
# library(readr)
# library(readxl)
# library(dplyr)
# library(caret)
# library(randomForest)
# library(doMC)
# library(e1071)
# 
# # Control random number generation
# set.seed(210) # set a seed to RNG
# 
# # register number of cores to be used for parallel evaluation
# registerDoMC(cores = parallel::detectCores())

## ----eval=FALSE, include=TRUE-------------------------------------------------
# file <- "https://tcga-data.nci.nih.gov/docs/publications/lgggbm_2016/LGG.GBM.meth.txt"
# if(!file.exists(basename(file))) downloader::download(file,basename(file))
# LGG.GBM <- as.data.frame(readr::read_tsv(basename(file)))
# rownames(LGG.GBM) <- LGG.GBM$Composite.Element.REF
# idx <- grep("TCGA",colnames(LGG.GBM))
# colnames(LGG.GBM)[idx] <- substr(colnames(LGG.GBM)[idx], 1, 12) # reduce complete barcode to sample identifier (first 12 characters)

## ----eval=FALSE, include=TRUE-------------------------------------------------
# file <- "http://www.cell.com/cms/attachment/2045372863/2056783242/mmc2.xlsx"
# if(!file.exists(basename(file))) downloader::download(file,basename(file))
# metadata <-  readxl::read_excel(basename(file), sheet = "S1A. TCGA discovery dataset", skip = 1)
# DT::datatable(
#   metadata[,c("Case",
#               "Pan-Glioma DNA Methylation Cluster",
#               "Supervised DNA Methylation Cluster",
#               "IDH-specific DNA Methylation Cluster")]
# )

## ----eval=FALSE, include=TRUE-------------------------------------------------
# file <- "http://zwdzwd.io/InfiniumAnnotation/current/EPIC/EPIC.manifest.hg38.rda"
# if(!file.exists(basename(file))) downloader::download(file,basename(file))
# load(basename(file))

## ----eval=FALSE, include=TRUE-------------------------------------------------
# file <- "https://tcga-data.nci.nih.gov/docs/publications/lgggbm_2015/PanGlioma_MethylationSignatures.xlsx"
# if(!file.exists(basename(file))) downloader::download(file,basename(file))

## ----eval=FALSE, include=TRUE-------------------------------------------------
# sheet <- "1,300 pan-glioma tumor specific"
# trainingset <- grep("mut|wt",unique(metadata$`Pan-Glioma DNA Methylation Cluster`),value = T)
# trainingcol <- c("Pan-Glioma DNA Methylation Cluster")

## ----eval=FALSE, include=TRUE-------------------------------------------------
# plat <- "EPIC"
# signature.probes <-  read_excel("PanGlioma_MethylationSignatures.xlsx",  sheet = sheet)  %>% pull(1)
# samples <- dplyr::filter(metadata, `IDH-specific DNA Methylation Cluster` %in% trainingset)
# RFtrain <- LGG.GBM[signature.probes, colnames(LGG.GBM) %in% as.character(samples$Case)] %>% na.omit

## ----eval=FALSE, include=TRUE-------------------------------------------------
# RFtrain <- RFtrain[!EPIC.manifest.hg38[rownames(RFtrain)]$MASK.general,]

## ----eval=FALSE, include=TRUE-------------------------------------------------
# trainingdata <- t(RFtrain)
# trainingdata <- merge(trainingdata, metadata[,c("Case", trainingcol[model])], by.x=0,by.y="Case", all.x=T)
# rownames(trainingdata) <- as.character(trainingdata$Row.names)
# trainingdata$Row.names <- NULL

## ----eval=FALSE, include=TRUE-------------------------------------------------
# nfeat <- ncol(trainingdata)
# trainingdata[,trainingcol] <-  factor(trainingdata[,trainingcol])
# mtryVals <- floor(sqrt(nfeat))
# for(i in floor(seq(sqrt(nfeat), nfeat/2, by = 2 * sqrt(nfeat)))) {
#   print(i)
#   x <- as.data.frame(
#     tuneRF(
#       trainingdata[,-grep(trainingcol[model],colnames(trainingdata))],
#       trainingdata[,trainingcol[model]],
#       stepFactor=2,
#       plot= FALSE,
#       mtryStart = i
#     )
#   )
#   mtryVals <- unique(c(mtryVals, x$mtry[which (x$OOBError == min(x$OOBError))]))
# }
# mtryGrid <- data.frame(.mtry = mtryVals)

## ----eval=FALSE, include=TRUE-------------------------------------------------
# fitControl <- trainControl(
#   method = "repeatedcv",
#   number = 10,
#   verboseIter = TRUE,
#   repeats = 10
# )

## ----eval=FALSE, include=TRUE-------------------------------------------------
# glioma.idh.model <- train(
#   y = trainingdata[,trainingcol], # variable to be trained on
#   x = trainingdata[,-grep(trainingcol,colnames(trainingdata))], # Daat labels
#   data = trainingdata, # Data we are using
#   method = "rf", # Method we are using
#   trControl = fitControl, # How we validate
#   ntree = 5000, # number of trees
#   importance = TRUE,
#   tuneGrid = mtryGrid, # set mtrys, the value that procuded a better model is used
# )

## -----------------------------------------------------------------------------
data(glioma.idh.model)
glioma.idh.model

## ----eval=FALSE, include=TRUE-------------------------------------------------
# sheet <- "1,308 IDHmutant tumor specific "
# trainingset <- grep("mut",unique(metadata$`IDH-specific DNA Methylation Cluster`),value = T)
# trainingcol <- "IDH-specific DNA Methylation Cluster"

## -----------------------------------------------------------------------------
data(glioma.idhmut.model)
glioma.idhmut.model

## ----eval=FALSE, include=TRUE-------------------------------------------------
# sheet <- "163  probes that define each TC"
# trainingset <- c("IDHmut-K1","IDHmut-K2")
# trainingcol <- "Supervised DNA Methylation Cluster"

## -----------------------------------------------------------------------------
data("glioma.gcimp.model")
glioma.gcimp.model

## ----eval=FALSE, include=TRUE-------------------------------------------------
# sheet <- "914 IDHwildtype tumor specific "
# trainingset <- grep("wt",unique(metadata$`IDH-specific DNA Methylation Cluster`),value = T))
# trainingcol <- "IDH-specific DNA Methylation Cluster"

## -----------------------------------------------------------------------------
data("glioma.idhwt.model")
glioma.idhwt.model

## -----------------------------------------------------------------------------
data("probes2rm")
head(probes2rm)

## ----eval = FALSE-------------------------------------------------------------
# scraplinks <- function(url){
#     # Create an html document from the url
#     webpage <- xml2::read_html(url)
#     # Extract the URLs
#     url_ <- webpage %>%
#         rvest::html_nodes("a") %>%
#         rvest::html_attr("href")
#     # Extract the link text
#     link_ <- webpage %>%
#         rvest::html_nodes("a") %>%
#         rvest::html_text()
#     tb <- tibble::tibble(link = link_, url = url_)
#     tb <- tb %>% dplyr::filter(tb$link == "Download")
#     return(tb)
# }
# 
# library(htmltab)
# library(dplyr)
# library(tidyr)
# root <- "http://linkedomics.org"
# root.download <- file.path(root,"data_download")
# linkedOmics <- htmltab(paste0(root,"/login.php#dataSource"))
# linkedOmics <- linkedOmics %>% unite(col = "download_page","Cohort Source","Cancer ID", remove = FALSE,sep = "-")
# linkedOmics.data <- plyr::adply(linkedOmics$download_page,1,function(project){
#     url <- file.path(root.download,project)
#     tryCatch({
#         tb <- cbind(tibble::tibble(project),htmltab(url),scraplinks(url))
#         tb$Link <- tb$link <- NULL
#         tb$dataset <- gsub(" \\(.*","",tb$`OMICS Dataset`)
#         tb
#     }, error = function(e) {
#         message(e)
#         return(NULL)
#     }
#     )
# },.progress = "time",.id = NULL)
# usethis::use_data(linkedOmics.data,internal = FALSE,compress = "xz")

## ----eval = FALSE-------------------------------------------------------------
# get_gene_information_biomart <- function(
#     genome = c("hg38","hg19"),
#     TSS = FALSE
# ){
#     requireNamespace("biomaRt")
#     genome <- match.arg(genome)
#     tries <- 0L
#     msg <- character()
#     while (tries < 3L) {
#         gene.location <- tryCatch({
#             host <- ifelse(
#                 genome == "hg19",
#                 "grch37.ensembl.org",
#                 "www.ensembl.org"
#             )
#             mirror <- list(NULL, "useast", "uswest", "asia")[[tries + 1]]
#             ensembl <- tryCatch({
#                 message(
#                     ifelse(
#                         is.null(mirror),
#                         paste0("Accessing ", host, " to get gene information"),
#                         paste0("Accessing ", host, " (mirror ", mirror, ")")
#                     )
#                 )
#                 biomaRt::useEnsembl(
#                     "ensembl",
#                     dataset = "hsapiens_gene_ensembl",
#                     host = host,
#                     mirror = mirror
#                 )
#             }, error = function(e) {
#                 message(e)
#                 return(NULL)
#             })
# 
#             # Column values we will recover from the database
#             attributes <- c(
#                 "ensembl_gene_id",
#                 "external_gene_name",
#                 "entrezgene",
#                 "chromosome_name",
#                 "strand",
#                 "end_position",
#                 "start_position",
#                 "gene_biotype"
#             )
# 
#             if (TSS)  attributes <- c(attributes, "transcription_start_site")
# 
#             db.datasets <- biomaRt::listDatasets(ensembl)
#             description <- db.datasets[db.datasets$dataset == "hsapiens_gene_ensembl", ]$description
#             message(paste0("Downloading genome information (try:", tries, ") Using: ", description))
#             gene.location <- biomaRt::getBM(
#                 attributes = attributes,
#                 filters = "chromosome_name",
#                 values = c(seq_len(22),"X","Y"),
#                 mart = ensembl
#             )
#             gene.location
#         }, error = function(e) {
#             msg <<- conditionMessage(e)
#             tries <<- tries + 1L
#             NULL
#         })
#         if (!is.null(gene.location)) break
#         if (tries == 3L) stop("failed to get URL after 3 tries:", "\n  error: ", msg)
#     }
# }
# gene.location.hg19 <- get_gene_information_biomart("hg19")
# save(gene.location.hg19, file = "gene.location.hg19.rda")
# 
# gene.location.hg38 <- get_gene_information_biomart("hg38")
# save(gene.location.hg38, file = "gene.location.hg38.rda")

## ----eval = FALSE-------------------------------------------------------------
# library(biomaRt)
# getTSS <- function(
#     genome = "hg38",
#     TSS = list(upstream = NULL, downstream = NULL)
# ) {
#     host <- ifelse(genome == "hg19",  "grch37.ensembl.org", "www.ensembl.org")
#     ensembl <- tryCatch({
#         useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", host =  host)
#     },  error = function(e) {
#         useEnsembl(
#           biomart = "ensembl",
#             dataset = "hsapiens_gene_ensembl",
#             host =  host
#         )
#     })
#     attributes <- c(
#         "chromosome_name",
#         "start_position",
#         "end_position",
#         "strand",
#         "transcript_start",
#         "transcription_start_site",
#         "transcript_end",
#         "ensembl_transcript_id",
#         "ensembl_gene_id",
#         "external_gene_name"
#     )
# 
#     chrom <- c(1:22, "X", "Y")
#     db.datasets <- listDatasets(ensembl)
#     description <- db.datasets[db.datasets$dataset == "hsapiens_gene_ensembl", ]$description
#     message(paste0("Downloading transcripts information. Using: ", description))
# 
#     tss <- getBM(
#         attributes = attributes,
#         filters = c("chromosome_name"),
#         values = list(chrom),
#         mart = ensembl
#     )
#     tss <- tss[!duplicated(tss$ensembl_transcript_id), ]
#     if (genome == "hg19") tss$external_gene_name <- tss$external_gene_id
#     tss$chromosome_name <-  paste0("chr", tss$chromosome_name)
#     tss$strand[tss$strand == 1] <- "+"
#     tss$strand[tss$strand == -1] <- "-"
# 
#     tss <- makeGRangesFromDataFrame(
#         tss,
#         start.field = "transcript_start",
#         end.field = "transcript_end",
#         keep.extra.columns = TRUE
#     )
# 
#     if (!is.null(TSS$upstream) & !is.null(TSS$downstream)) {
#         tss <- promoters(
#             tss,
#             upstream = TSS$upstream,
#             downstream = TSS$downstream
#         )
#     }
#     return(tss)
# }
# 
# gene.location.hg19.tss <- getTSS("hg19")
# save(gene.location.hg19.tss, file = "gene.location.hg19.tss.rda")
# 
# gene.location.hg38.tss <- getTSS("hg38")
# save(gene.location.hg38.tss, file = "gene.location.hg38.tss.rda")
# 

## ----sessionInfo--------------------------------------------------------------
sessionInfo()