## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(message = FALSE, 
                      warning = FALSE,
          					  tidy = FALSE, 
          					  fig.width = 6, 
          					  fig.height = 6,
          					  fig.align = "center")
# Handle the biofilecache error
library(BiocFileCache)
bfc <- BiocFileCache()
res <- bfcquery(bfc, 
                "annotationhub.index.rds", 
                field = "rname", 
                exact = TRUE) 
bfcremove(bfc, rids = res$rid)

## ----fig.cap = "A workflow for ChIP-seq data analysis", echo = FALSE, out.width = "100%", out.height = "100%"----
png <- system.file("extdata", "dataFlow.png", package = "ChIPpeakAnno")

knitr::include_graphics(png)

## -----------------------------------------------------------------------------
library(ChIPpeakAnno)

macs_peak <- system.file("extdata", "MACS_peaks.xls", 
                         package = "ChIPpeakAnno")
macs_peak_gr <- toGRanges(macs_peak, format = "MACS")
head(macs_peak_gr, n = 2)

## -----------------------------------------------------------------------------
# Use Ensembl annotation package:
library(EnsDb.Hsapiens.v86)

ensembl.hs86.transcript <- transcripts(EnsDb.Hsapiens.v86)

## -----------------------------------------------------------------------------
# Use Ensembl annotation package:
macs_peak_ensembl <- annotatePeakInBatch(macs_peak_gr, 
										 AnnotationData = ensembl.hs86.transcript)
head(macs_peak_ensembl, n = 2)

## -----------------------------------------------------------------------------
# Use Ensembl annotation package:
macs_peak_ensembl <- addGeneIDs(annotatedPeak = macs_peak_ensembl, 
                                orgAnn = EnsDb.Hsapiens.v86,
                                feature_id_type = "ensembl_transcript_id",
                                IDs2Add = "SYMBOL")
head(macs_peak_ensembl, n = 2)

## -----------------------------------------------------------------------------
# Convert GFF to GRanges:
macs_peak_gff <- system.file("extdata", "GFF_peaks_hg38.gff", 
                             package = "ChIPpeakAnno")
macs_peak_gr1 <- toGRanges(macs_peak_gff, format = "GFF", header = FALSE)
head(macs_peak_gr1, n = 2)

# Convert BED to GRanges:
macs_peak_bed <- system.file("extdata", "MACS_output_hg38.bed", 
                             package = "ChIPpeakAnno")
macs_peak_gr2 <- toGRanges(macs_peak_bed, format = "BED", header = FALSE)
head(macs_peak_gr2, n = 2)

## -----------------------------------------------------------------------------
# For two samples:
data(peaks1)
data(peaks2)
head(peaks1, n = 2)
head(peaks2, n = 2)
ol <- findOverlapsOfPeaks(peaks1, peaks2)
names(ol)

# For three samples:
data(peaks3)
ol2 <- findOverlapsOfPeaks(peaks1, peaks2, peaks3,
                           connectedPeaks = "min")

# Find peaks with at least 90% overlap:
ol3 <- findOverlapsOfPeaks(peaks1, peaks2, minoverlap = 0.9)

## -----------------------------------------------------------------------------
# For peaks1:
length(ol$peaklist[["peaks1"]])

# For peaks2:
length(ol$peaklist[["peaks2"]])

## ----results = "hide", fig.cap = "Venn diagram showing the number of overlapping peaks for two samples", fig.small = TRUE----
# For two samples:
venn <- makeVennDiagram(ol, totalTest = 100,
                        fill = c("#009E73", "#F0E442"),
                        col = c("#D55E00", "#0072B2"),
                        cat.col = c("#D55E00", "#0072B2"))

# For three samples:
venn2 <- makeVennDiagram(ol2, totalTest = 100,
                         fill = c("#CC79A7", "#56B4E9", "#F0E442"),
                         col = c("#D55E00", "#0072B2", "#E69F00"),
                         cat.col = c("#D55E00", "#0072B2", "#E69F00"))

# For peaks overlap at least 90%:
venn3 <- makeVennDiagram(ol3, totalTest = 100,
                         fill = c("#009E73", "#F0E442"),
                         col = c("#D55E00", "#0072B2"),
                         cat.col = c("#D55E00", "#0072B2"))

## ----eval = FALSE-------------------------------------------------------------
# if (!library(Vennerable)) {
#   install.packages("Vennerable", repos="http://R-Forge.R-project.org")
#   library(Vennerable)
# }
# 
# n <- which(colnames(ol$vennCounts) == "Counts") - 1
# set_names <- colnames(ol$vennCounts)[1:n]
# weight <- ol$vennCounts[, "Counts"]
# names(weight) <- apply(ol$vennCounts[, 1:n], 1, base::paste, collapse = "")
# v <- Venn(SetNames = set_names, Weight = weight)
# plot(v)

## -----------------------------------------------------------------------------
names(ol$overlappingPeaks)
dim(ol$overlappingPeaks[["peaks1///peaks2"]])
ol$overlappingPeaks[["peaks1///peaks2"]][1:2, ]
unique(ol$overlappingPeaks[["peaks1///peaks2"]][["overlapFeature"]])
pie1(table(ol$overlappingPeaks[["peaks1///peaks2"]]$overlapFeature))

## ----echo = FALSE-------------------------------------------------------------
library(knitr)

Resource <- c("Ensembl", "NCBI RefSeq")
Generated_by <- c("EMBL-EBI", "NCBI")
Annotation_criteria <- c("Comprehensive (most transcripts)", "Conservative (fewer transcripts)")
Gene_id_name <- c("Ensembl gene ID", "NCBI Gene ID, or entrezGene ID, or entrez ID")
URL <- c("https://ftp.Ensembl.org/pub/", "https://ftp.ncbi.nlm.nih.gov/refseq/")
df <- data.frame(Resource, Generated_by, Annotation_criteria, URL)

kable(df, caption = 'Commonly used annotation resources')

## ----echo = FALSE-------------------------------------------------------------
library(knitr)

Human <- c("[hg38.refGene.gtf.gz](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.refGene.gtf.gz)",
           "[hg38.ncbiRefSeq.gtf.gz](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.ncbiRefSeq.gtf.gz)",
           "[hg38.knownGene.gtf.gz](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.knownGene.gtf.gz)")
Mouse <- c("[mm10.refGene.gtf.gz](https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/genes/mm10.refGene.gtf.gz)",
           "[mm10.ncbiRefSeq.gtf.gz](https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/genes/mm10.ncbiRefSeq.gtf.gz)",
           "[mm10.knownGene.gtf.gz](https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/genes/mm10.knownGene.gtf.gz)")
Remark <- c("Based on RefSeq transcripts aligned by UCSC followed by manual curation",
            "Based on RefSeq transcripts as aligned by NCBI",
            "Based on Ensembl gene models")
df <- data.frame(Human, Mouse, Remark)

kable(df, caption = "UCSC Genome Browser hosted annotation files")

## -----------------------------------------------------------------------------
library(AnnotationHub)

ah <- AnnotationHub()

# Obtain EnsDb:
EnsDb_Mmusculus_all <- query(ah, pattern = c("Mus musculus", "EnsDb"))
head(EnsDb_Mmusculus_all, n = 2)
EnsDb_Mmusculus <- EnsDb_Mmusculus_all[["AH53222"]]
class(EnsDb_Mmusculus)

# Obtain TxDb:
TxDb_Mmusculus_all <- query(ah, pattern = c("Mus musculus", "TxDb"))
head(TxDb_Mmusculus_all, n = 2)
TxDb_Mmusculus <- TxDb_Mmusculus_all[["AH52264"]]
class(TxDb_Mmusculus)

## ----eval = FALSE-------------------------------------------------------------
# library(biomaRt)
# 
# listMarts()
# head(listDatasets(useMart("ENSEMBL_MART_ENSEMBL")), n = 2)
# mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
#                 dataset = "mmusculus_gene_ensembl")
# anno_from_biomart <- getAnnotation(mart,
#                                    featureType = "transcript")
# head(anno_from_biomart, n = 2)

## -----------------------------------------------------------------------------
library(EnsDb.Hsapiens.v86)

anno_ensdb_transcript <- toGRanges(EnsDb.Hsapiens.v86, 
                                   feature = "transcript")
head(anno_ensdb_transcript, n = 2)

## -----------------------------------------------------------------------------
anno_ensdb_transcript <- transcripts(EnsDb.Hsapiens.v86)

head(anno_ensdb_transcript, n = 2)

## ----fig.cap = "Peak count distribution around TSSs", warning = FALSE---------
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

annotation_data <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
binOverFeature(macs_peak_gr2, 
               featureSite = "FeatureStart",
               nbins = 20,
               annotationData = annotation_data,
               xlab = "peak distance from TSS (bp)", 
               ylab = "peak count", 
               main = "Distribution of aggregated peak numbers around TSS")

## ----fig.cap = "Bar graph showing peak distributions over different genomic features"----
chromosome_region <- assignChromosomeRegion(macs_peak_gr2,
                                            TxDb = TxDb.Hsapiens.UCSC.hg38.knownGene,
                                            nucleotideLevel = FALSE,
                                            precedence=c("Promoters",
                                                         "immediateDownstream", 
                                                         "fiveUTRs", 
                                                         "threeUTRs",
                                                         "Exons", 
                                                         "Introns"))

# optional helper function to rotate x-axis labels for barplot(): 
# ref: https://stackoverflow.com/questions/10286473/rotating-x-axis-labels-in-r-for-barplot
rotate_x <- function(data, rot_angle) {
  plt <- barplot(data, xaxt = "n")
  text(plt, par("usr")[3], 
       labels = names(data), 
       srt = rot_angle, adj = c(1.1,1.1), 
       xpd = TRUE, cex = 0.6)
}

rotate_x(chromosome_region[["percentage"]], 45)

## ----fig.cap = "Pie graph showing peak distributions over features at different levels"----
genomicElementDistribution(macs_peak_gr1, 
                           TxDb = TxDb.Hsapiens.UCSC.hg38.knownGene)

## ----fig.cap = "Bar graph showing peak distributions over features at different levels"----
macs_peaks <- GRangesList(rep1 = macs_peak_gr1,
                          rep2 = macs_peak_gr2)
genomicElementDistribution(macs_peaks, 
                           TxDb = TxDb.Hsapiens.UCSC.hg38.knownGene)

## ----fig.cap = "UpSet plot showing the overlappings of peak distributions among different genomic features"----
library(UpSetR)

res <- genomicElementUpSetR(macs_peak_gr1,
                            TxDb.Hsapiens.UCSC.hg38.knownGene)
upset(res[["plotData"]], 
      nsets = length(colnames(res$plotData)), 
      nintersects = NA)

## ----fig.cap = "How does peak-centric annotation method work", echo = FALSE, out.width = "100%", out.height = "100%"----
png <- system.file("extdata", "annoPeakCentric.png", package = "ChIPpeakAnno")

knitr::include_graphics(png)

## ----fig.cap = "How does feature-centric annotation method work", echo = FALSE, out.width = "100%", out.height = "100%"----
png <- system.file("extdata", "annoFeatureCentric1.png", package = "ChIPpeakAnno")

knitr::include_graphics(png)

## ----fig.cap = "How to define target region with bindingRegion", echo = FALSE, out.width = "100%", out.height = "100%"----
png <- system.file("extdata", "annoFeatureCentric2.png", package = "ChIPpeakAnno")

knitr::include_graphics(png)

## -----------------------------------------------------------------------------
library(TxDb.Hsapiens.UCSC.hg18.knownGene)

data(myPeakList)
peak_to_anno <- myPeakList[1:100]
anno_data <- transcripts(TxDb.Hsapiens.UCSC.hg18.knownGene)

annotated_peak <- annotatePeakInBatch(peak_to_anno, 
                                      output = "nearestLocation",
                                      PeakLocForDistance = "start",
                                      FeatureLocForDistance = "TSS",
                                      AnnotationData = anno_data)
head(annotated_peak, n = 2)

## -----------------------------------------------------------------------------
annotated_peak <- annotatePeakInBatch(peak_to_anno, 
                                      AnnotationData = anno_data,
                                      output = "both",
                                      maxgap = 100)
head(annotated_peak, n = 4)

## ----fig.cap = "Peak distribution around features"----------------------------
pie1(table(annotated_peak$insideFeature))

## -----------------------------------------------------------------------------
TF_binding_sites <- GRanges(seqnames = c("1", "2", "3", "4", "5", "6", "1", 
                                         "2", "3", "4", "5", "6", "6", "6", 
                                         "6", "6", "5"),
                            ranges = IRanges(start = c(967659, 2010898, 2496700, 
                                                       3075866, 3123260, 3857500,
                                                       96765, 201089, 249670, 
                                                       307586, 312326, 385750, 
                                                       1549800, 1554400, 1565000, 
                                                       1569400, 167888600),
                                             end = c(967869, 2011108, 2496920, 
                                                     3076166,3123470, 3857780,
                                                     96985, 201299, 249890, 307796, 
                                                     312586, 385960, 1550599, 
                                                     1560799, 1565399, 1571199, 
                                                     167888999),
                                             names = paste("t", 1:17, sep = "")),
                            strand = c("*", "*", "*", "*", "*", "*", "*", "*", "*", 
                                       "*", "*", "*", "*", "*", "*", "*", "*"))

annotated_peak2 <- annotatePeakInBatch(peaks1, AnnotationData = TF_binding_sites)
head(annotated_peak2, n = 2)

## -----------------------------------------------------------------------------
promoter_regions <- promoters(TxDb.Hsapiens.UCSC.hg18.knownGene, 
                              upstream = 2000, downstream = 500)
head(promoter_regions, n = 2)

annotated_peak3 <- annotatePeakInBatch(peak_to_anno, 
                                       AnnotationData = promoter_regions)
head(annotated_peak3, n = 2)

## -----------------------------------------------------------------------------
anno_txdb <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
head(anno_txdb$gene_id, n = 5)

anno_ensdb <- genes(EnsDb.Hsapiens.v86)
head(anno_ensdb$gene_id, n = 5)

## -----------------------------------------------------------------------------
library(org.Hs.eg.db)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

ucsc.hg38.knownGene <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
entrez_ids <- head(ucsc.hg38.knownGene$gene_id, n = 10)
print(entrez_ids)

res <- addGeneIDs(entrez_ids, 
                  orgAnn = "org.Hs.eg.db", 
                  feature_id_type = "entrez_id",
                  IDs2Add = "symbol")
head(res, n = 3)

## -----------------------------------------------------------------------------
txdb.hg38.transcript <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
head(txdb.hg38.transcript, n = 4)

## -----------------------------------------------------------------------------
names(txdb.hg38.transcript) <- txdb.hg38.transcript$tx_name

## -----------------------------------------------------------------------------
res <- annotatePeakInBatch(macs_peak_gr2, 
                           AnnotationData = txdb.hg38.transcript)
head(res, n = 2)

## -----------------------------------------------------------------------------
tryCatch({
  # we will first try biomaRt
  library(biomaRt)

  mart <- useMart(biomart = "ensembl",
                  dataset = "hsapiens_gene_ensembl")
  res <- addGeneIDs(res, 
                    mart = mart,
                    feature_id_type = "ensembl_transcript_id",
                    IDs2Add = "hgnc_symbol")
  head(res, n = 2)
}, error=function(e){
  message(e)
  library(org.Hs.eg.db)
  res <- addGeneIDs(res, 
                    orgAnn = TxDb.Hsapiens.UCSC.hg38.knownGene,
                    feature_id_type = "ensembl_transcript_id",
                    IDs2Add = "GENEID")
  res$symbol[!is.na(res$GENEID)] <- 
    xget(res$GENEID[!is.na(res$GENEID)], org.Hs.egSYMBOL, output = 'first')
  tail(res, n = 2)
})

## ----fig.cap = "Histogram showing enriched GO terms"--------------------------
anno_data <- genes(EnsDb.Hsapiens.v86)
annotated_peak4 <- annotatePeakInBatch(macs_peak_gr2,
                                       AnnotationData = anno_data,
                                       output = "both")

enriched_go <- getEnrichedGO(annotated_peak4, 
                             orgAnn = "org.Hs.eg.db", 
                             feature_id_type = "ensembl_gene_id")
enrichmentPlot(enriched_go, label_wrap = 60)

## -----------------------------------------------------------------------------
length(enriched_go[["bp"]][["go.id"]])
length(enriched_go[["cc"]][["go.id"]])
length(enriched_go[["mf"]][["go.id"]])

## -----------------------------------------------------------------------------
head(enriched_go[["mf"]], n = 2)

## ----fig.cap = "Bar graph showing enriched pathways"--------------------------
library(reactome.db)

data(annotatedPeak)
enriched_path <- getEnrichedPATH(annotatedPeak,
                                 orgAnn = "org.Hs.eg.db",
                                 feature_id_type = "ensembl_gene_id",
                                 pathAnn = "reactome.db")

## ----fig.cap = "Histogram showing enriched pathways"--------------------------
enrichmentPlot(enriched_path)

## -----------------------------------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg38)

sequence_around_peak <- getAllPeakSequence(macs_peak_gr2, 
                                           upstream = 20,
                                           downstream = 20, 
                                           genome = BSgenome.Hsapiens.UCSC.hg38)
head(sequence_around_peak, n = 2)

## ----eval = FALSE-------------------------------------------------------------
# library(biomaRt)
# 
# mart <- useMart(biomart="ENSEMBL_MART_ENSEMBL",
#                 dataset="hsapiens_gene_ensembl")
# 
# sequence_around_peak <- getAllPeakSequence(macs_peak_gr2[1],
#                                            upstream = 20,
#                                            downstream = 20,
#                                            genome = mart)

## ----eval = FALSE-------------------------------------------------------------
# write2FASTA(sequence_around_peak, file = "macs_peak_gr2.fa")

## -----------------------------------------------------------------------------
freqs <- oligoFrequency(BSgenome.Hsapiens.UCSC.hg38$chr1)
motif_summary <- oligoSummary(sequence_around_peak, 
                              oligoLength = 6,
                              MarkovOrder = 3,
                              freqs = freqs,
                              quickMotif = TRUE)

## ----fig.cap = "Histogram showing Z-score distribution"-----------------------
zscore <- sort(motif_summary$zscore)
h <- hist(zscore, breaks = 100, main = "Histogram of Z-score")
text(x = zscore[length(zscore)], 
     y = h$counts[length(h$counts)] + 1, 
     labels = names(zscore[length(zscore)]), 
     adj = 0, 
     srt = 90)

## ----fig.cap = "Histogram showing Z-score distribution for simulation data"----
set.seed(1)

# motifs to simulate
simulate_motif_1 <- "AATTTA"
simulate_motif_2 <- "TGCATG"

# sample 5000 sequences with lengths ranging from 100 to 2000 nucleotides
# randomly insert motif_1 to 10% of the sequences, and motif_2 to 15% of the sequences
simulation_seqs <- sapply(sample(c(1, 2, 0), 
                                 5000,
                                 prob = c(0.1, 0.15, 0.75),
                                 replace = TRUE), 
                           function(x) {
                             seq <- sample(c("A", "T", "C", "G"),
                                           sample.int(1900, 1) + 100, 
                                           replace = TRUE)
                             insert_pos <- sample.int(length(seq) - 6, 1)
                             if (x == 1) {
                               seq[insert_pos:(insert_pos + 5)] <- strsplit(simulate_motif_1, "")[[1]]
                             } else if (x == 2) {
                               seq[insert_pos:(insert_pos + 5)] <- strsplit(simulate_motif_2, "")[[1]]
                             }
                             paste(seq, collapse = "")
                           }
)

motif_summary_simu <- oligoSummary(simulation_seqs, 
                                   oligoLength = 6, 
                                   MarkovOrder = 3, 
                                   quickMotif = TRUE)
zscore_simu <- sort(motif_summary_simu$zscore, 
                    decreasing = TRUE)
h_simu <- hist(x = zscore_simu, 
               breaks = 100, 
               main = "Histogram of Z-score for simulation data")
text(x = zscore_simu[1:2],  
     y = rep(5, 2), 
     labels = names(zscore_simu[1:2]), 
     adj = 0, 
     srt = 90)

## ----fig.small = TRUE---------------------------------------------------------
library(motifStack)

pfm <- new("pfm", mat = motif_summary_simu$motifs[[1]],
           name = "sample motif 1")
motifStack(pfm)

## ----fig.cap = "Plot showing the first motif detected", fig.small = TRUE------
pfms <- mapply(function(motif, id) { new("pfm", mat = motif, name = as.character(id)) },
               motif_summary$motifs,
               seq_along(motif_summary$motifs))

motifStack(pfms[[1]])

## -----------------------------------------------------------------------------
example_pattern_file <- system.file("extdata/examplePattern.fa",
                                    package = "ChIPpeakAnno")
readLines(example_pattern_file)
pattern_in_peak <- summarizePatternInPeaks(patternFilePath = example_pattern_file,
                                           BSgenomeName = BSgenome.Hsapiens.UCSC.hg38,
                                           peaks = macs_peak_gr2[1:200],
                                           bgdForPerm = "chromosome",
                                           nperm = 100, 
                                           method = "permutation.test")

pattern_in_peak$motif_enrichment
head(pattern_in_peak$motif_occurrence, n = 2)

## -----------------------------------------------------------------------------
tf1 <- toGRanges(system.file("extdata/TAF.broadPeak", package = "ChIPpeakAnno"),
                 format = "broadPeak")
tf2 <- toGRanges(system.file("extdata/Tead4.broadPeak", package = "ChIPpeakAnno"),
                 format = "broadPeak")
tf3 <- toGRanges(system.file("extdata/YY1.broadPeak", package = "ChIPpeakAnno"),
                 format = "broadPeak")

## ----results = "hide", fig.cap = "Venn diagram1 showing overlapping peaks", fig.small = TRUE----
overlapping_peaks <- findOverlapsOfPeaks(tf1, 
                                         tf2, 
                                         tf3, 
                                         connectedPeaks = "keepAll")
mean_peak_width <- mean(width(unlist(GRangesList(overlapping_peaks[["all.peaks"]]))))

total_binding_sites <- length(BSgenome.Hsapiens.UCSC.hg38[["chr2"]]) * 0.03 / mean_peak_width
venn1 <- makeVennDiagram(overlapping_peaks, 
                         totalTest = total_binding_sites, 
                         connectedPeaks = "keepAll", 
                         fill = c("#CC79A7", "#56B4E9", "#F0E442"),
                         col = c("#D55E00", "#0072B2", "#E69F00"),
                         cat.col = c("#D55E00", "#0072B2", "#E69F00"))

## -----------------------------------------------------------------------------
venn1[["p.value"]]

## -----------------------------------------------------------------------------
venn1[["vennCounts"]]

## ----results = "hide", fig.cap = "Venn diagram2 showing overlapping peaks", fig.small = TRUE----
venn2 <- makeVennDiagram(overlapping_peaks, 
                         totalTest = total_binding_sites, 
                         connectedPeaks = "keepFirstListConsistent", 
                         fill = c("#CC79A7", "#56B4E9", "#F0E442"),
                         col = c("#D55E00", "#0072B2", "#E69F00"),
                         cat.col = c("#D55E00", "#0072B2", "#E69F00"))

## ----eval = FALSE-------------------------------------------------------------
# library(TxDb.Hsapiens.UCSC.hg38.knownGene)
# 
# txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
# set.seed(123)
# 
# # Example1: non-relevant peak sets
# utr5 <- unique(unlist(fiveUTRsByTranscript(txdb)))
# utr3 <- unique(unlist(threeUTRsByTranscript(txdb)))
# 
# utr5 <- utr5[sample.int(length(utr5), 1000)]
# utr3 <- utr3[sample.int(length(utr3), 1000)]
# 
# pt1 <- peakPermTest(peaks1 = utr3,
#                     peaks2 = utr5,
#                     TxDb = txdb,
#                     maxgap = 500,
#                     seed = 1)
# plot(pt1)

## ----fig.cap = "Permutation test for non-relevant peak sets", echo = FALSE, out.width = "100%", out.height = "100%"----
png <- system.file("extdata", "permTest1.png", package = "ChIPpeakAnno")

knitr::include_graphics(png)

## ----eval = FALSE-------------------------------------------------------------
# # Example2: highly relevant peak sets
# cds <- unique(unlist(cdsBy(txdb)))
# ol <- findOverlaps(cds, utr3, maxgap = 1)
# peaks2 <- c(cds[sample.int(length(cds), 500)],
#             cds[queryHits(ol)][sample.int(length(ol), 500)])
# pt2 <- peakPermTest(peaks1 = utr3,
#                     peaks2 = peaks2,
#                     TxDb = txdb,
#                     maxgap = 500,
#                     seed = 1)
# plot(pt2)

## ----fig.cap = "Permutation test for highly relevant peak sets", echo = FALSE, out.width = "100%", out.height = "100%"----
png <- system.file("extdata", "permTest2.png", package = "ChIPpeakAnno")

knitr::include_graphics(png)

## ----eval = FALSE-------------------------------------------------------------
# # Step1: download TF binding sites
# temp <- tempfile()
# download.file(file.path("https://hgdownload.cse.ucsc.edu/",
#                         "goldenpath/",
#                         "hg38/",
#                         "encRegTfbsClustered/",
#                         "encRegTfbsClusteredWithCells.hg38.bed.gz"),
#               temp)
# df_tfbs <- read.delim(gzfile(temp, "r"), header = FALSE)
# unlink(temp)
# 
# colnames(df_tfbs)[1:4] <- c("seqnames", "start", "end", "TF")
# tfbs_hg38 <- GRanges(as.character(df_tfbs$seqnames),
#                      IRanges(df_tfbs$start, df_tfbs$end),
#                      TF = df_tfbs$TF)
# 
# # Step2: download hot spots
# base_url <- "http://metatracks.encodenets.gersteinlab.org/metatracks/"
# 
# temp1 <- tempfile()
# temp2 <- tempfile()
# download.file(file.path(base_url,
#                         "HOT_All_merged.tar.gz"),
#               temp1)
# download.file(file.path(base_url,
#                         "HOT_intergenic_All_merged.tar.gz"),
#               temp2)
# untar(temp1, exdir = dirname(temp1))
# untar(temp2, exdir = dirname(temp1))
# bedfiles <- dir(dirname(temp1), "bed$")
# hot_spots_hg19 <- sapply(file.path(dirname(temp1), bedfiles), toGRanges, format = "BED")
# unlink(temp1)
# unlink(temp2)
# 
# names(hot_spots_hg19) <- gsub("_merged.bed", "", bedfiles)
# hot_spots_hg19 <- sapply(hot_spots_hg19, unname)
# hot_spots_hg19 <- GRangesList(hot_spots_hg19)
# 
# # Step3: liftover hot spots to hg38
# library(R.utils)
# 
# temp_chain_gz <- tempfile()
# temp_chain <- tempfile()
# base_url_chain <- "http://hgdownload.cse.ucsc.edu/goldenpath/"
# download.file(file.path(base_url_chain,
#                         "hg19/liftOver/",
#                         "hg19ToHg38.over.chain.gz"),
#               temp_chain_gz)
# gunzip(filename = temp_chain_gz, destname = temp_chain)
# chain_file <- import.chain(hg19_to_hg38)
# unlink(temp_chain_gz)
# unlink(temp_chain)
# 
# hot_spots_hg38 <- liftOver(hot_spots_hg19, chain_file)
# 
# # Step4: select peak sets to test
# tfbs_hg38_by_TF <- split(tfbs_hg38, tfbs_hg38$TF)
# TAF1 <- tfbs_hg38_by_TF[["TAF1"]]
# TEAD4 <- tfbs_hg38_by_TF[["TEAD4"]]
# 
# # Step5: remove hot spots from binding pool
# tfbs_hg38 <- subsetByOverlaps(tfbs_hg38, hot_spots_hg38, invert=TRUE)
# tfbs_hg38 <- reduce(tfbs_hg38)
# 
# # Step6: perform permutation test
# pool <- new("permPool",
#             grs = GRangesList(tfbs_hg38),
#             N = length(TAF1))
# pt3 <- peakPermTest(TAF1, TEAD4, pool = pool, ntimes = 500)
# plot(pt3)

## ----fig.cap = "Permutation test using custom peak pool", echo = FALSE, out.width = "100%", out.height = "100%"----
png <- system.file("extdata", "permTest3.png", package = "ChIPpeakAnno")

knitr::include_graphics(png)

## ----fig.cap = "Heatmap of coverage densities for selected TFs"---------------
# Step1: read in example peak files
extdata_path <- system.file("extdata", package = "ChIPpeakAnno")
broadPeaks <- dir(extdata_path, "broadPeak")
gr_TFs <- sapply(file.path(extdata_path, broadPeaks), toGRanges, format = "broadPeak")
names(gr_TFs) <- gsub(".broadPeak", "", broadPeaks)
names(gr_TFs)

## -----------------------------------------------------------------------------
# Step2: find peaks that are shared by all
ol <- findOverlapsOfPeaks(gr_TFs)
gr_TFs_ol <- ol$peaklist$`TAF///Tead4///YY1`

# Step3: read in example coverage files
# here we read in coverage data from -2000bp to -2000bp of each shared peak center
gr_TFs_ol_center <- reCenterPeaks(gr_TFs_ol, width = 4000) # use the center of the peaks and extend 2000bp upstream and downstream to obtain peaks with uniform length of 4000bp

bigWigs <- dir(extdata_path, "bigWig")
coverage_list <- sapply(file.path(extdata_path, bigWigs), 
                        import, # rtracklayer::import
                        format = "BigWig",
                        which = gr_TFs_ol_center,
                        as = "RleList")

names(coverage_list) <- gsub(".bigWig", "", bigWigs)
names(coverage_list)

## ----fig.cap = "Heatmap of coverages for selected TFs", fig.height = 6, message = FALSE----
# Step4: visualize binding patterns
sig <- featureAlignedSignal(coverage_list, gr_TFs_ol_center)

# since the bigWig files are only a subset of the original files,
# filter to keep peaks that are with coverage data for all peak sets
keep <- rowSums(sig[[1]]) > 0 & rowSums(sig[[2]]) > 0 & rowSums(sig[[3]]) > 0
sig <- sapply(sig, function(x) x[keep, ], simplify = FALSE)
gr_TFs_ol_center <- gr_TFs_ol_center[keep]

featureAlignedHeatmap(sig, gr_TFs_ol_center,
                      upper.extreme=c(3, 0.5, 4))

## ----fig.cap = "Heatmap of coverages for selected TFs sorted by hierarchical clustering", fig.height = 6, message = FALSE----
# perform hierarchical clustering on rows
sig_rowsums <- sapply(sig, rowSums, na.rm = TRUE)
row_distance <- dist(sig_rowsums)
hc <- hclust(row_distance)

# use hierarchical clustering order to sort
gr_TFs_ol_center$sort_by <- hc$order
featureAlignedHeatmap(sig, gr_TFs_ol_center, 
                      upper.extreme = c(3, 0.5, 4),
                      sortBy = "sort_by")

## ----fig.cap = "Distribution of coverage densities for selected TFs", message = FALSE----
featureAlignedDistribution(sig, gr_TFs_ol_center,
                           type = "l")

## ----message = FALSE, warning = FALSE-----------------------------------------
library(ChIPpeakAnno)

# Convert BED/GFF into GRanges
bed1 <- system.file("extdata", "MACS_output_hg38.bed", 
                    package = "ChIPpeakAnno")
gr1 <- toGRanges(bed1, format = "BED", header = FALSE)
gff1 <- system.file("extdata", "GFF_peaks_hg38.gff", 
                    package = "ChIPpeakAnno")
gr2 <- toGRanges(gff1, format = "GFF", header = FALSE)

# Find overlapping peaks
ol <- findOverlapsOfPeaks(gr1, gr2)

# Add "score" metadata column to overlapping peaks
ol <- addMetadata(ol, colNames = "score", FUN = mean) 
head(ol$mergedPeaks, n = 2)

## ----results = "hide", fig.cap = "Venn diagram showing the number of overlapping peaks for gr1 and gr2", fig.small = TRUE----
venn <- makeVennDiagram(ol,
                        fill = c("#009E73", "#F0E442"),
                        col = c("#D55E00", "#0072B2"),
                        cat.col = c("#D55E00", "#0072B2"))

## -----------------------------------------------------------------------------
venn[["p.value"]]

## -----------------------------------------------------------------------------
venn[["vennCounts"]]

## ----warning = FALSE----------------------------------------------------------
library(EnsDb.Hsapiens.v86)

ensembl.hs86.gene <- toGRanges(EnsDb.Hsapiens.v86, feature = "gene")
head(ensembl.hs86.gene, n = 2)

## ----fig.cap = "Peak count distribution around transcription start sites", warning = FALSE----
binOverFeature(ol$mergedPeaks, 
               nbins = 20,
               annotationData = ensembl.hs86.gene,
               xlab = "peak distance from TSSs (bp)", 
               ylab = "peak count", 
               main = "Distribution of aggregated peak numbers around TSS")

## ----fig.cap = "Pie graph showing peak distributions over different genomic features"----
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

genomicElementDistribution(ol$mergedPeaks, 
                           TxDb = TxDb.Hsapiens.UCSC.hg38.knownGene)

## ----fig.cap = "Bar graph showing peak distributions over different genomic features for two replicates"----
macs_peaks <- GRangesList(rep1 = gr1,
                          rep2 = gr2)
genomicElementDistribution(macs_peaks, 
                           TxDb = TxDb.Hsapiens.UCSC.hg38.knownGene)

## ----fig.cap = "UpSet plot showing peak overlappings for multiple features"----
library(UpSetR)

res <- genomicElementUpSetR(ol$mergedPeak,
                            TxDb.Hsapiens.UCSC.hg38.knownGene)
upset(res[["plotData"]], 
      nsets = length(colnames(res$plotData)), 
      nintersects = NA)

## -----------------------------------------------------------------------------
ol_anno <- annotatePeakInBatch(ol$mergedPeak, 
                               AnnotationData = ensembl.hs86.gene, 
                               output = "nearestBiDirectionalPromoters",
                               bindingRegion = c(-2000, 500))
head(ol_anno, n = 2)

## ----eval = FALSE-------------------------------------------------------------
# ol_anno <- unname(ol_anno) # remove names to avoid duplicate row.names error
# ol_anno$peakNames <- NULL # remove peakNames to avoid unimplemented type 'list' error
# write.csv(as.data.frame(ol_anno), "ol_anno.csv")

## ----fig.cap = "Pie chart showing peak distribution around features"----------
pie1(table(ol_anno$insideFeature))

## -----------------------------------------------------------------------------
peaks_near_BDP <- peaksNearBDP(ol$mergedPeaks, 
                    AnnotationData = ensembl.hs86.gene, 
                    MaxDistance = 5000) 
# MaxDistance will be translated into "bindingRegion = 
# c(-MaxDistance, MaxDistance)" internally

peaks_near_BDP$n.peaks
peaks_near_BDP$n.peaksWithBDP
peaks_near_BDP$percentPeaksWithBDP

head(peaks_near_BDP$peaksWithBDP, n = 2)

## -----------------------------------------------------------------------------
library(EnsDb.Hsapiens.v75)

ensembl.hs75.gene <- toGRanges(EnsDb.Hsapiens.v75, feature = "gene")

DNA5C <- system.file("extdata", 
                     "wgEncodeUmassDekker5CGm12878PkV2.bed.gz",
                     package="ChIPpeakAnno")
DNAinteractiveData <- toGRanges(gzfile(DNA5C))
# the example bed.gz file can also be downloaded from:
# https://hgdownload.cse.ucsc.edu/goldenpath/hg19/encodeDCC/wgEncodeUmassDekker5C/wgEncodeUmassDekker5CGm12878PkV2.bed.gz

peaks_near_enhancer <-  findEnhancers(peaks = ol$mergedPeaks,
                                      annoData = ensembl.hs75.gene, 
                                      DNAinteractiveData = DNAinteractiveData)
head(peaks_near_enhancer, n = 2)

## ----fig.cap = "Histogram showing enriched GO terms"--------------------------
library(org.Hs.eg.db)

enriched_go <- getEnrichedGO(annotatedPeak = ol_anno, 
                             orgAnn = "org.Hs.eg.db", 
                             feature_id_type = "ensembl_gene_id",
                             condense = TRUE)
enrichmentPlot(enriched_go)

## ----fig.cap = "Histogram showing enriched pathways"--------------------------
library(reactome.db)

enriched_pathway <- getEnrichedPATH(annotatedPeak = ol_anno,
                                    orgAnn = "org.Hs.eg.db", 
                                    pathAnn = "KEGGREST",
                                    maxP = 0.05)
enrichmentPlot(enriched_pathway)

# To remove the common suffix " - Homo sapiens (human)":
enrichmentPlot(enriched_pathway, label_substring_to_remove = " - Homo sapiens \\(human\\)")

## -----------------------------------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg38)

seq_around_peak <- getAllPeakSequence(ol$mergedPeaks, 
                                      upstream = 20,
                                      downstream = 20, 
                                      genome = BSgenome.Hsapiens.UCSC.hg38)
head(seq_around_peak, n = 2)

## ----fig.cap = "Histogram showing Z-score distribution for oligos of length 6"----
freqs <- oligoFrequency(BSgenome.Hsapiens.UCSC.hg38$chr1)
motif_summary <- oligoSummary(seq_around_peak, 
                              oligoLength = 6,
                              MarkovOrder = 3,
                              freqs = freqs,
                              quickMotif = TRUE)
zscore <- sort(motif_summary$zscore)
h <- hist(zscore, 
          breaks = 100, 
          main = "Histogram of Z-score")
text(x = zscore[length(zscore)], 
     y = h$counts[length(h$counts)] + 1, 
     labels = names(zscore[length(zscore)]), 
     adj = 0, 
     srt = 90)

## ----fig.cap = "Sequence log of detected motif 1", fig.small = TRUE-----------
library(motifStack)

pfm <- new("pfm", mat = motif_summary$motifs[[1]],
           name = "sample motif 1")
motifStack(pfm)

## ----warning = FALSE----------------------------------------------------------
tf1 <- toGRanges(system.file("extdata/TAF.broadPeak", package = "ChIPpeakAnno"),
                 format = "broadPeak")
tf2 <- toGRanges(system.file("extdata/Tead4.broadPeak", package = "ChIPpeakAnno"),
                 format = "broadPeak")
tf3 <- toGRanges(system.file("extdata/YY1.broadPeak", package = "ChIPpeakAnno"),
                 format = "broadPeak")

## ----results = "hide", fig.cap = "Venn diagram showing overlapping peaks for three TFs", fig.small = TRUE----
library(BSgenome.Hsapiens.UCSC.hg38)

overlapping_peaks <- findOverlapsOfPeaks(tf1, 
                                         tf2, 
                                         tf3, 
                                         connectedPeaks = "keepAll")
mean_peak_width <- mean(width(unlist(GRangesList(overlapping_peaks[["all.peaks"]]))))

total_binding_sites <- length(BSgenome.Hsapiens.UCSC.hg38[["chr2"]]) * 0.03 / mean_peak_width
venn1 <- makeVennDiagram(overlapping_peaks, 
                         totalTest = total_binding_sites, 
                         connectedPeaks = "keepAll", 
                         fill = c("#CC79A7", "#56B4E9", "#F0E442"),
                         col = c("#D55E00", "#0072B2", "#E69F00"),
                         cat.col = c("#D55E00", "#0072B2", "#E69F00"))

## -----------------------------------------------------------------------------
venn1[["p.value"]]

## ----fig.cap = "Heatmap of coverages for selected TFs", fig.height = 6, message = FALSE----
ol_tfs <- findOverlapsOfPeaks(tf1, tf2, tf3, 
                              connectedPeaks = "keepAll")
gr_ol_tfs <- ol_tfs$peaklist$`tf1///tf2///tf3`

TF_width <- width(gr_ol_tfs)
gr_ol_tfs_center <- reCenterPeaks(gr_ol_tfs, width = 4000)

extdata_path <- system.file("extdata", package = "ChIPpeakAnno")
bigWigs <- dir(extdata_path, "bigWig")
coverage_list <- sapply(file.path(extdata_path, bigWigs), 
                        import, # rtracklayer::import
                        format = "BigWig",
                        which = gr_ol_tfs_center,
                        as = "RleList")

names(coverage_list) <- gsub(".bigWig", "", bigWigs)

sig <- featureAlignedSignal(coverage_list, gr_ol_tfs_center)
# since the bigWig files are only a subset of the original files,
# filter to keep peaks that are with coverage data for all peak sets
keep <- rowSums(sig[[1]]) > 0 & rowSums(sig[[2]]) > 0 & rowSums(sig[[3]]) > 0
sig <- sapply(sig, function(x) x[keep, ], simplify = FALSE)
gr_ol_tfs_center <- gr_ol_tfs_center[keep]

featureAlignedHeatmap(sig, gr_ol_tfs_center,
                      upper.extreme=c(3, 0.5, 4))

## ----fig.cap = "Heatmap of coverages for selected TFs sorted by YY1 sample", fig.height = 6, message = FALSE----
featureAlignedHeatmap(sig, gr_ol_tfs_center,
                      upper.extreme=c(3, 0.5, 4),
                      sortBy = "YY1")

## ----fig.cap = "Distribution of coverage densities for selected TFs", message = FALSE----
featureAlignedDistribution(sig, gr_ol_tfs_center, 
                           type="l")

## ----warning = FALSE----------------------------------------------------------
library(ensembldb)
library(EnsDb.Hsapiens.v75)
data(myPeakList)
annoData <- annoGR(EnsDb.Hsapiens.v75)

# Step1: annotate peaks to the overlapping features, if "select = 'all'", multiple features can be assigned to a single peak.
anno_overlapping <- annotatePeakInBatch(myPeakList, 
                                        AnnotationData = annoData, 
                                        output = "overlapping", 
                                        select = "first")
anno_overlapping_non_na <- anno_overlapping[!is.na(anno_overlapping$feature)]

# Step2: annotate peaks that are without overlapping features to nearest features
myPeakList_non_overlapping <- myPeakList[!(names(myPeakList) %in% anno_overlapping_non_na$peak)]  
anno_nearest <- annotatePeakInBatch(myPeakList_non_overlapping,
                                    AnnotationData = annoData, 
                                    output = "nearestLocation", 
                                    select = "first")

# Step3: concatenate the two
anno_final <- c(anno_overlapping_non_na, anno_nearest)

## ----warning = FALSE----------------------------------------------------------
library(ChIPpeakAnno)
library(reshape2)

# Step1: read in two peak files
bed <- system.file("extdata", 
                   "MACS_output.bed",
                   package = "ChIPpeakAnno")
gff <- system.file("extdata", 
                   "GFF_peaks.gff", 
                   package = "ChIPpeakAnno")
gr1 <- toGRanges(bed, format = "BED", 
                 header = FALSE)
gr2 <- toGRanges(gff, format = "GFF", 
                 header = FALSE, skip = 3)
names(gr2) <- seq(length(gr2)) # add names to gr2 for Step4

# Step2: find overlapping peaks
ol <- findOverlapsOfPeaks(gr1, gr2)
peakNames <- ol$peaklist[['gr1///gr2']]$peakNames

# Step3: extract original peak names
peakNames <- melt(peakNames, 
                  value.name = "merged_peak_id") # reshape df
head(peakNames, n = 2)

peakNames <- cbind(peakNames[, 1], 
                   do.call(rbind, 
                           strsplit(as.character(peakNames[, 3]), "__")))

colnames(peakNames) <- c("merged_peak_id", "group", "peakName")
head(peakNames, n = 2)

# Step4: split by peak group
gr1_subset <- gr1[peakNames[peakNames[, "group"] %in% "gr1", "peakName"]]
gr2_subset <- gr2[peakNames[peakNames[, "group"] %in% "gr2", "peakName"]]
head(gr1_subset, n = 2)
head(gr2_subset, n = 2)

## -----------------------------------------------------------------------------
gr1_renamed <- ol$all.peaks$gr1
gr2_renamed <- ol$all.peaks$gr2
head(gr1_renamed, n = 2)
head(gr2_renamed, n = 2)

peakNames <- melt(ol$peaklist[['gr1///gr2']]$peakNames, 
                  value.name = "merged_peak_id")
gr1_subset <- gr1_renamed[peakNames[grepl("^gr1", peakNames[, 3]), 3]]
gr2_subset <- gr2_renamed[peakNames[grepl("^gr2", peakNames[, 3]), 3]]
head(gr1_subset, n = 2)
head(gr2_subset, n = 2)

## ----citation, echo = FALSE---------------------------------------------------
citation(package = "ChIPpeakAnno")

## ----sessionInfo, echo = FALSE------------------------------------------------
sessionInfo()

