## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = TRUE,
  fig.width = 12,
  fig.height = 6.75
)

## ----setup, message = F-------------------------------------------------------
library(methodical)
library(DESeq2)
library(TumourMethData)

## ----eval=FALSE---------------------------------------------------------------
# # Installing Methodical
# if (!require("BiocManager"))
#     install.packages("BiocManager")
# BiocManager::install("methodical")

## -----------------------------------------------------------------------------
# Download mcrpc_wgbs_hg38 from TumourMethDatasets.
mcrpc_wgbs_hg38_chr11 <- TumourMethData::download_meth_dataset(dataset = "mcrpc_wgbs_hg38_chr11")

# Download transcript counts
mcrpc_transcript_counts_file <- TumourMethData::download_rnaseq_dataset(dataset = "mcrpc_wgbs_hg38_chr11")
mcrpc_transcript_counts = data.frame(data.table::fread(mcrpc_transcript_counts_file), row.names = 1)

## -----------------------------------------------------------------------------
# Download Gencode annotation 
library(AnnotationHub)
transcript_annotation <- AnnotationHub()[["AH75119"]]

# Import the Gencode annotation and subset for transcript annotation for non-mitochondrial protein-coding genes
transcript_annotation <- transcript_annotation[transcript_annotation$type == "transcript"]
transcript_annotation <-  transcript_annotation[transcript_annotation$transcript_type == "protein_coding"]

# Remove transcript version from ID
transcript_annotation$ID <- gsub("\\.[0-9]*", "", transcript_annotation$ID)

# Filter for transcripts on chromosome 11
transcript_annotation_chr11 <- transcript_annotation[seqnames(transcript_annotation) == "chr11"]

# Filter for transcripts in the counts table
transcript_annotation_chr11 <- transcript_annotation_chr11[transcript_annotation_chr11$ID %in% row.names(mcrpc_transcript_counts)]

# Set transcript ID as names of ranges
names(transcript_annotation_chr11) <- transcript_annotation_chr11$ID

# Get the TSS for each transcript set ID as names
transcript_tss_chr11 <- resize(transcript_annotation_chr11, 1, fix = "start")
rm(transcript_annotation)

# Get the region surrounding each TSS
tss_proximal_regions_chr11 = methodical::expand_granges(transcript_tss_chr11, upstream = 5000, downstream = 5000)

## -----------------------------------------------------------------------------
# Subset mcrpc_transcript_counts for protein-coding transcripts
mcrpc_transcript_counts <- mcrpc_transcript_counts[transcript_tss_chr11$ID, ]

# Create a DESeqDataSet from counts and normalize. 
mcrpc_transcript_counts_dds <- DESeqDataSetFromMatrix(countData = mcrpc_transcript_counts, 
    colData = data.frame(sample = names(mcrpc_transcript_counts)), design = ~ 1)
mcrpc_transcript_counts_dds <- estimateSizeFactors(mcrpc_transcript_counts_dds) 
mcrpc_transcript_counts_normalized <- data.frame(counts(mcrpc_transcript_counts_dds, normalized = TRUE))

## -----------------------------------------------------------------------------
# Find samples with both WGBS and RNA-seq count data
common_mcprc_samples <- intersect(names(mcrpc_transcript_counts), colnames(mcrpc_wgbs_hg38_chr11))

# Create a BPPARAM class
BPPARAM = BiocParallel::SerialParam()

# Calculate CpG methylation-transcription correlations for all TSS on chromosome 11
system.time({transcript_cpg_meth_cors_mcrpc_samples_5kb <- calculateMethSiteTranscriptCors(
  meth_rse = mcrpc_wgbs_hg38_chr11, 
  transcript_expression_table = mcrpc_transcript_counts_normalized, 
  samples_subset = common_mcprc_samples, 
  tss_gr = transcript_tss_chr11, tss_associated_gr = tss_proximal_regions_chr11,
  cor_method = "pearson", max_sites_per_chunk = NULL, BPPARAM = BPPARAM)})

## -----------------------------------------------------------------------------
# Extract correlation results for ENST00000398606 transcript
gstp1_cpg_meth_transcript_cors <- transcript_cpg_meth_cors_mcrpc_samples_5kb[["ENST00000398606"]]

# Examine first few rows of the results
head(gstp1_cpg_meth_transcript_cors)

# Extract TSS for the transcript 
attributes(gstp1_cpg_meth_transcript_cors)$tss_range

## ----fig.show='hide'----------------------------------------------------------
# Plot methylation-transcription correlation values for GSTP1 showing 
# chromosome 11 position on the x-axis
meth_cor_plot_absolute_distance <- plotMethSiteCorCoefs(
  meth_site_cor_values = gstp1_cpg_meth_transcript_cors, 
  reference_tss = FALSE, 
  ylabel = "Correlation Value", 
  title = "Correlation Between GSTP1 Transcription and CpG Methylation")
print(meth_cor_plot_absolute_distance)

# Plot methylation-transcription correlation values for GSTP1 showing 
# distance to the GSTP1 on the x-axis
meth_cor_plot_relative_distance <- plotMethSiteCorCoefs(
  meth_site_cor_values = gstp1_cpg_meth_transcript_cors, 
  reference_tss = TRUE,   
  ylabel = "Correlation Value", 
  title = "Correlation Between GSTP1 Transcription and CpG Methylation")
print(meth_cor_plot_relative_distance)

## ----fig.show='hide'----------------------------------------------------------
# Add a dashed line to meth_cor_plot_relative_distance where the correlation is 0
meth_cor_plot_relative_distance + ggplot2::geom_hline(yintercept = 0, linetype = "dashed")

## ----fig.show='hide'----------------------------------------------------------
# Calculate gene body methylation-transcription correlations for all TSS on chromosome 11
system.time({transcript_body_meth_cors_mcrpc_samples <- calculateRegionMethylationTranscriptCors(
  meth_rse = mcrpc_wgbs_hg38_chr11, 
  transcript_expression_table = mcrpc_transcript_counts_normalized, 
  samples_subset = common_mcprc_samples, 
  genomic_regions = transcript_annotation_chr11, 
  genomic_region_transcripts = transcript_annotation_chr11$ID,
  region_methylation_summary_function = colMeans, 
  cor_method = "pearson", BPPARAM = BPPARAM)})

# Show the top of the results table
head(transcript_body_meth_cors_mcrpc_samples)

# We'll filter for significant correlation values and plot their distribution
transcript_body_meth_cors_significant <- 
dplyr::filter(transcript_body_meth_cors_mcrpc_samples, q_val < 0.05)

ggplot(transcript_body_meth_cors_significant, aes(x = cor)) +
geom_histogram() + theme_bw() + 
scale_y_continuous(expand = expansion(mult = c(0, 0.2), add = 0)) +
labs(x = "Correlation Values", y = "Number of Significant Correlation Values")
  

## -----------------------------------------------------------------------------
sessionInfo()

