## ----fig.align='center', message=FALSE, warning=FALSE, eval=TRUE--------------
library(ttgsea)
library(fgsea)
data(examplePathways)
data(exampleRanks)
names(examplePathways) <- gsub("_", " ", substr(names(examplePathways), 9, 1000))

set.seed(1)
fgseaRes <- fgseaSimple(examplePathways, exampleRanks, nperm = 10000)
data.table::data.table(fgseaRes[order(fgseaRes$NES, decreasing = TRUE),])

### convert from gene set defined by BiocSet::BiocSet to list
#library(BiocSet)
#genesets <- BiocSet(examplePathways)
#gsc_list <- as(genesets, "list")

# convert from gene set defined by GSEABase::GeneSetCollection to list
#library(GSEABase)
#genesets <- BiocSet(examplePathways)
#gsc <- as(genesets, "GeneSetCollection")
#gsc_list <- list()
#for (i in 1:length(gsc)) {
#  gsc_list[[setName(gsc[[i]])]] <- geneIds(gsc[[i]])
#}

#set.seed(1)
#fgseaRes <- fgseaSimple(gsc_list, exampleRanks, nperm = 10000)

## ----fig.align='center', message=FALSE, warning=FALSE, eval=TRUE--------------
if (keras::is_keras_available() & reticulate::py_available()) {
  # model parameters
  num_tokens <- 1000
  length_seq <- 30
  batch_size <- 32
  embedding_dim <- 50
  num_units <- 32
  epochs <- 10
  
  # algorithm
  ttgseaRes <- fit_model(fgseaRes, "pathway", "NES",
                         model = bi_lstm(num_tokens, embedding_dim,
                                         length_seq, num_units),
                         num_tokens = num_tokens,
                         length_seq = length_seq,
                         epochs = epochs,
                         batch_size = batch_size,
                         use_generator = FALSE,
                         callbacks = keras::callback_early_stopping(
                            monitor = "loss",
                            patience = 10,
                            restore_best_weights = TRUE))
  
  # prediction for every token
  ttgseaRes$token_pred
  ttgseaRes$token_gsea[["TGF beta"]][,1:5]
}

## ----fig.align='center', message=FALSE, warning=FALSE, eval=TRUE--------------
if (exists("ttgseaRes")) {
  # prediction with MC p-value
  set.seed(1)
  new_text <- c("Cell Cycle DNA Replication",
                "Cell Cycle",
                "DNA Replication",
                "Cycle Cell",
                "Replication DNA",
                "TGF-beta receptor")
  print(predict_model(ttgseaRes, new_text))
  print(predict_model(ttgseaRes, "data science"))
}

## ----fig.align='center', message=FALSE, warning=FALSE, eval=TRUE--------------
if (exists("ttgseaRes")) {
  summary(ttgseaRes$model)
  plot_model(ttgseaRes$model)
}

## ----fig.align='center', message=FALSE, warning=FALSE, eval=FALSE-------------
# if (keras::is_keras_available() & reticulate::py_available()) {
#   # leading edge
#   LE <- unlist(lapply(fgseaRes$leadingEdge, function(x) gsub(",", "", toString(x))))
#   fgseaRes <- cbind(fgseaRes, LE)
# 
#   # model parameters
#   num_tokens <- 1000
#   length_seq <- 30
#   batch_size <- 32
#   embedding_dim <- 50
#   num_units <- 32
#   epochs <- 10
# 
#   # algorithm
#   ttgseaRes <- fit_model(fgseaRes, "LE", "NES",
#                          model = bi_lstm(num_tokens, embedding_dim,
#                                          length_seq, num_units),
#                          num_tokens = num_tokens,
#                          length_seq = length_seq,
#                          epochs = epochs,
#                          batch_size = batch_size,
#                          verbose = 0,
#                          callbacks = callback_early_stopping(
#                             monitor = "loss",
#                             patience = 5,
#                             restore_best_weights = TRUE))
# 
#   # prediction for every token
#   ttgseaRes$token_pred
# 
#   # prediction with MC p-value
#   set.seed(1)
#   new_text <- c("107995 56150", "16952")
#   predict_model(ttgseaRes, new_text)
# }

## ----fig.align='center', message=FALSE, warning=FALSE, eval=FALSE-------------
# if (keras::is_keras_available() & reticulate::py_available()) {
#   ## data preparation
#   library(airway)
#   data(airway)
# 
#   ## differentially expressed genes
#   library(DESeq2)
#   des <- DESeqDataSet(airway, design = ~ dex)
#   des <- DESeq(des)
#   res <- results(des)
#   head(res)
#   # log2FC used for GSEA
#   statistic <- res$"log2FoldChange"
#   names(statistic) <- rownames(res)
#   statistic <- na.omit(statistic)
#   head(statistic)
# 
#   ## gene set
#   library(org.Hs.eg.db)
#   library(BiocSet)
#   go <- go_sets(org.Hs.eg.db, "ENSEMBL", ontology = "BP")
#   go <- as(go, "list")
#   # convert GO id to term name
#   library(GO.db)
#   names(go) <- Term(GOTERM)[names(go)]
# 
#   ## GSEA
#   library(fgsea)
#   set.seed(1)
#   fgseaRes <- fgsea(go, statistic)
#   head(fgseaRes)
# 
#   ## tokenizing text of GSEA
#   # model parameters
#   num_tokens <- 5000
#   length_seq <- 30
#   batch_size <- 64
#   embedding_dim <- 128
#   num_units <- 32
#   epochs <- 20
#   # algorithm
#   ttgseaRes <- fit_model(fgseaRes, "pathway", "NES",
#                          model = bi_lstm(num_tokens, embedding_dim,
#                                          length_seq, num_units),
#                          num_tokens = num_tokens,
#                          length_seq = length_seq,
#                          epochs = epochs,
#                          batch_size = batch_size,
#                          callbacks = keras::callback_early_stopping(
#                            monitor = "loss",
#                            patience = 5,
#                            restore_best_weights = TRUE))
#   # prediction
#   ttgseaRes$token_pred
#   set.seed(1)
#   predict_model(ttgseaRes, c("translation response",
#                              "cytokine activity",
#                              "rhodopsin mediate",
#                              "granzyme",
#                              "histone deacetylation",
#                              "helper T cell",
#                              "Wnt"))
# }

## ----eval=TRUE----------------------------------------------------------------
sessionInfo()

