## ----style, echo=FALSE, results='asis', message=FALSE-------------------------
knitr::opts_chunk$set(
    tidy = FALSE,
    warning = FALSE,
    message = FALSE
)

library(yulab.utils)
Biocannopkg <- yulab.utils::Biocpkg

## ----echo=FALSE, results='hide', message=FALSE--------------------------------
library(GenomicFeatures)
library(GenomicRanges)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(BSgenome.Hsapiens.UCSC.hg38)
library(org.Hs.eg.db)
library(ggplot2)
library(clusterProfiler)
library(ReactomePA)
library(epiSeeker)

## ----load-data----------------------------------------------------------------
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
data(peakAnnoList)
# peakAnnoList <- lapply(files, annotateSeq, TxDb=txdb,
#                        tssRegion=c(-3000, 3000), verbose=FALSE)
peakAnno <- peakAnnoList[[4]]

## ----read-peak-file-----------------------------------------------------------
files <- getSampleFiles()
print(files)
peak <- readPeakFile(files[[4]])
peak

## ----cov-plot, fig.height=4, fig.width=10-------------------------------------
plotCov(peak, weightCol = "V5", chrs = c("chr17", "chr18"), xlim = c(4.5e7, 5e7), interactive = TRUE)

## ----eval=FALSE---------------------------------------------------------------
# # Details of the dataset and pre-processing we used here can be found from xxxx
# # We only show usage here for demonstration
# pancancer_atac <- readRDS("./data/pancancer_atac.rds")
# 
# fill_color <- c(
#     "#a10589", "#fce364", "#854d23", "#c0b742",
#     "#1c670d", "#3ec486", "#f7bbff", "#ee0700",
#     "#172ec5", "#f56103", "#3f3a95", "#7a7875",
#     "#e06cdc", "#7effff", "#f9b250", "#af1e18",
#     "#997f69", "#9d82ed", "#40bcf5", "#f28582",
#     "#97be8c", "#7b2ed0", "#81f75b"
# )
# 
# design <- "AAAB
#            AAAB
#            AAAB
#            CCC#"
# 
# atac_p <- plotCov(pancancer_atac,
#     weightCol = "V5",
#     chrs = "chr8", title = "",
#     xlim = c(126712193, 128412193),
#     xlab = "", fill_col = fill_color,
#     legend_position = "none", facet_var = ".id~.",
#     add_cluster_tree = TRUE,
#     add_coaccess = TRUE,
#     design = design
# )

## ----eval=FALSE---------------------------------------------------------------
# tssTagMatrix <- getTagMatrix(
#     peak = peak, upstream = 3000, downstream = 3000, weightCol = "V5",
#     type = "start_site", by = "transcript", TxDb = txdb
# )
# 
# # plot_prof = FALSE will plot only heatmap
# plotPeakHeatmap(tssTagMatrix,
#     plot_prof = TRUE,
#     statistic_method = "mean",
#     missingDataAsZero = TRUE,
#     conf = 0.95, resample = 1000
# )

## ----eval=FALSE---------------------------------------------------------------
# plotPeakProf(tssTagMatrix, conf = 0.95, resample = 1000)

## ----plot-bm-prof, fig.height=10, fig.width=15, fig.align="center"------------
data(demo_bmdata)
bmMatrix <- getBmMatrix(
    region = data.frame(chr = "chr22", start = 10525991, end = 10526342),
    BSgenome = BSgenome.Hsapiens.UCSC.hg38,
    input = demo_bmdata,
    base = "C",
    motif = c("CG")
)

# Interactive function is also supported by interactive parameter
plotBmProf(bmMatrix, interactive = TRUE)
# htmlwidgets::saveWidget(p, "base_modification.html")

## ----plot-gene-track, fig.height=3, fig.width=8, fig.align="center"-----------
# this region is the same as the region of pancancer figure
plotGeneTrack(
    txdb = txdb, chr = "chr8", start_pos = 126712193,
    end_pos = 128412193, OrgDb = "org.Hs.eg.db"
)

## ----plot-motif-prof, fig.height=5, fig.width=10, fig.align="center"----------
# motif reference of position-specific weight matrix
# opts_base <- list()
# opts_base[["collection"]] <- "CORE"
# opts_base[["all_versions"]] <- FALSE
# opts_human <- opts_base
# opts_human[["species"]] <- "Homo sapiens"
# opts_human[["tax_group"]] <- "vertebrates"
# sq24 <- RSQLite::dbConnect(RSQLite::SQLite(), JASPAR2024::db(JASPAR2024::JASPAR2024()))
# pwm_obj <- TFBSTools::getMatrixSet(sq24, opts_human)

data(pwm_obj)
motifMatrix <- getMotifMatrix(
    region = GRanges(
        seqnames = "chr22",
        ranges = IRanges(start = 10525991, end = 10526342)
    ),
    pwm = pwm_obj, ref_obj = BSgenome.Hsapiens.UCSC.hg38
)

# Interactive function is also supported by interactive parameter
plotMotifProf(motifMatrix, interactive = TRUE)

## ----annotate-seq, eval = FALSE-----------------------------------------------
# peakAnno <- annotateSeq(files[[4]],
#     tssRegion = c(-3000, 3000),
#     TxDb = txdb, annoDb = "org.Hs.eg.db"
# )

## ----annotate-seq-edb, eval = FALSE-------------------------------------------
# library(EnsDb.Hsapiens.v75)
# edb <- EnsDb.Hsapiens.v75
# seqlevelsStyle(edb) <- "UCSC"
# 
# peakAnno.edb <- annotateSeq(files[[4]],
#     tssRegion = c(-3000, 3000),
#     TxDb = edb, annoDb = "org.Hs.eg.db"
# )

## ----plot-anno-bar, fig.cap="Genomic Annotation by barplot", fig.align="center", fig.height=4, fig.width=10----
plotAnnoBar(peakAnno)

## ----plot-dist-to-tss, fig.cap="Distribution of Binding Sites", fig.align="center", fig.height=2, fig.width=6----
plotDistToTSS(peakAnno,
    title = "Distribution of transcription factor-binding loci\nrelative to TSS"
)

## ----enrichment-analysis, eval = FALSE----------------------------------------
# library(ReactomePA)
# 
# pathway1 <- enrichPathway(as.data.frame(peakAnno)$geneId)
# head(pathway1, 2)
# ##                          ID        Description GeneRatio   BgRatio RichFactor
# ## R-HSA-9830369 R-HSA-9830369 Kidney development    19/506  46/11214  0.4130435
# ## R-HSA-9758941 R-HSA-9758941       Gastrulation    27/506 115/11214  0.2347826
# ##               FoldEnrichment    zScore       pvalue     p.adjust       qvalue
# ## R-HSA-9830369       9.153892 12.045870 2.601506e-14 2.796619e-11 2.760335e-11
# ## R-HSA-9758941       5.203265  9.848629 8.375539e-13 4.501852e-10 4.443444e-10
# ##                                                                                                                                                   geneID
# ## R-HSA-9830369                                          2625/5076/7490/652/6495/2303/3975/6928/10736/5455/7849/3237/6092/2122/255743/2296/3400/28514/2138
# ## R-HSA-9758941 5453/5692/5076/5080/7546/3169/652/5015/2303/5717/3975/2627/5714/344022/7849/5077/2637/7022/8320/7545/6657/4487/51176/2296/28514/2626/64321
# ##               Count
# ## R-HSA-9830369    19
# ## R-HSA-9758941    27
# 
# gene <- seq2gene(peak, tssRegion = c(-1000, 1000), flankDistance = 3000, TxDb = txdb)
# pathway2 <- enrichPathway(gene)
# head(pathway2, 2)
# ##                          ID        Description GeneRatio   BgRatio RichFactor
# ## R-HSA-9830369 R-HSA-9830369 Kidney development    17/415  46/11214  0.3695652
# ## R-HSA-9758941 R-HSA-9758941       Gastrulation    24/415 115/11214  0.2086957
# ##               FoldEnrichment    zScore       pvalue     p.adjust       qvalue
# ## R-HSA-9830369       9.986276 11.971929 2.162576e-13 2.175552e-10 2.114772e-10
# ## R-HSA-9758941       5.639309  9.802874 3.528656e-12 1.774914e-09 1.725327e-09
# ##                                                                                                                                    geneID
# ## R-HSA-9830369                                      2625/5076/3227/652/6495/2303/3975/3237/6092/2122/255743/7490/6928/7849/5455/2296/28514
# ## R-HSA-9758941 5453/5692/5076/7546/3169/652/2303/5717/3975/2627/5714/344022/2637/8320/7545/7020/2626/5080/5015/7849/51176/2296/64321/28514
# ##               Count
# ## R-HSA-9830369    17
# ## R-HSA-9758941    24
# 
# dotplot(pathway2)

## ----plot-anno-bar-list, fig.cap="Genomic Annotation among different ChIPseq data", fig.align="center", fig.height=4, fig.width=6----
plotAnnoBar(peakAnnoList)

## ----compare-cluster, eval = FALSE--------------------------------------------
# genes <- lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
# names(genes) <- sub("_", "\n", names(genes))
# compKEGG <- compareCluster(
#     geneCluster = genes,
#     fun = "enrichKEGG",
#     pvalueCutoff = 0.05,
#     pAdjustMethod = "BH"
# )
# dotplot(compKEGG, showCategory = 15, title = "KEGG Pathway Enrichment Analysis")

## ----venn-plot, fig.cap="Overlap of annotated genes", fig.align="center", fig.height=7, fig.width=7----
genes <- lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
vennplot(genes)

## ----shuffle-peak-------------------------------------------------------------
p <- GRanges(
    seqnames = c("chr1", "chr3"),
    ranges = IRanges(start = c(1, 100), end = c(50, 130))
)
shuffle(p, TxDb = txdb)

## ----enrich-peak-overlap, eval = FALSE----------------------------------------
# enrichPeakOverlap(
#     queryPeak = files[[5]],
#     targetPeak = unlist(files[1:4]),
#     TxDb = txdb,
#     pAdjustMethod = "BH",
#     nShuffle = 50,
#     chainFile = NULL,
#     verbose = FALSE
# )
# ##                                                       qSample
# ## ARmo_0M    GSM1295077_CBX7_BF_ChipSeq_mergedReps_peaks.bed.gz
# ## ARmo_1nM   GSM1295077_CBX7_BF_ChipSeq_mergedReps_peaks.bed.gz
# ## ARmo_100nM GSM1295077_CBX7_BF_ChipSeq_mergedReps_peaks.bed.gz
# ## CBX6_BF    GSM1295077_CBX7_BF_ChipSeq_mergedReps_peaks.bed.gz
# ##                                                       tSample qLen tLen N_OL
# ## ARmo_0M                       GSM1174480_ARmo_0M_peaks.bed.gz 1663  812    0
# ## ARmo_1nM                     GSM1174481_ARmo_1nM_peaks.bed.gz 1663 2296    8
# ## ARmo_100nM                 GSM1174482_ARmo_100nM_peaks.bed.gz 1663 1359    3
# ## CBX6_BF    GSM1295076_CBX6_BF_ChipSeq_mergedReps_peaks.bed.gz 1663 1331  968
# ##                pvalue   p.adjust
# ## ARmo_0M    0.82352941 0.82352941
# ## ARmo_1nM   0.17647059 0.35294118
# ## ARmo_100nM 0.33333333 0.44444444
# ## CBX6_BF    0.01960784 0.07843137

## ----get-geo-species----------------------------------------------------------
getGEOspecies() |> head()

## ----get-geo-genome-----------------------------------------------------------
getGEOgenomeVersion() |> head()

## ----get-geo-info-------------------------------------------------------------
hg19 <- getGEOInfo(genome = "hg19", simplify = TRUE)
head(hg19)

## ----download-geo-bed, eval=FALSE---------------------------------------------
# downloadGEObedFiles(genome = "hg19", destDir = "hg19")

## ----download-gsm-bed, eval=FALSE---------------------------------------------
# gsm <- hg19$gsm[sample(nrow(hg19), 10)]
# downloadGSMbedFiles(gsm, destDir = "hg19")

## ----session-info, echo=FALSE-------------------------------------------------
sessionInfo()

