## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = "center",
  message = FALSE,
  warning = FALSE,
  out.width = "100%"
)

# Set seed for reproducibility
set.seed(42)

## ----eval=FALSE---------------------------------------------------------------
# # Install from GitHub
# devtools::install_github("Zaoqu-Liu/SVG")
# 
# # Install all optional dependencies
# install.packages(c("geometry", "RANN", "BRISC", "CompQuadForm",
#                    "spatstat.geom", "spatstat.explore"))

## ----load-package-------------------------------------------------------------
# Load the SVG package
library(SVG)

# Load the built-in example dataset
data(example_svg_data)

# Display dataset structure
cat("Dataset components:\n")
names(example_svg_data)

## ----data-summary-------------------------------------------------------------
# Extract components
expr_counts <- example_svg_data$counts        # Raw counts
expr_log <- example_svg_data$logcounts        # Log-normalized
coords <- example_svg_data$spatial_coords     # Spatial coordinates
gene_info <- example_svg_data$gene_info       # Gene metadata

cat("Data Dimensions:\n")
cat("  Number of spots:", ncol(expr_counts), "\n")
cat("  Number of genes:", nrow(expr_counts), "\n")
cat("  True SVGs:", sum(gene_info$is_svg), "\n")
cat("  Non-SVGs:", sum(!gene_info$is_svg), "\n")
cat("\nSpatial Pattern Types:\n")
print(table(gene_info$pattern_type))

## ----spatial-layout, fig.width=7, fig.height=6--------------------------------
oldpar <- par(mar = c(4, 4, 3, 2))
plot(coords[, 1], coords[, 2],
     pch = 19, cex = 0.9,
     col = adjustcolor("steelblue", alpha.f = 0.7),
     xlab = "X coordinate (μm)",
     ylab = "Y coordinate (μm)",
     main = "Spatial Spot Layout (Hexagonal Grid)",
     asp = 1)
grid(lty = 2, col = "gray80")

# Add spot count annotation
mtext(paste("n =", nrow(coords), "spots"), side = 3, line = 0.3, cex = 0.9)
par(oldpar)

## ----expr-distribution, fig.width=10, fig.height=4----------------------------
oldpar <- par(mfrow = c(1, 2), mar = c(4, 4, 3, 1))

# Raw counts distribution
hist(log10(rowMeans(expr_counts) + 1), breaks = 30,
     col = "lightblue", border = "white",
     xlab = expression(log[10](Mean~Count + 1)),
     ylab = "Number of Genes",
     main = "Gene Expression Distribution")

# Spot library size
hist(log10(colSums(expr_counts)), breaks = 30,
     col = "lightgreen", border = "white",
     xlab = expression(log[10](Library~Size)),
     ylab = "Number of Spots",
     main = "Spot Library Size Distribution")
par(oldpar)

## ----pattern-visualization, fig.width=12, fig.height=10-----------------------
# Define color palette for expression visualization
expr_colors <- function(x, pal = "RdYlBu") {
  x_scaled <- (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE) + 1e-10)
  if (pal == "RdYlBu") {
    colors <- colorRampPalette(c("#313695", "#4575B4", "#74ADD1", "#ABD9E9", 
                                  "#E0F3F8", "#FFFFBF", "#FEE090", "#FDAE61", 
                                  "#F46D43", "#D73027", "#A50026"))(100)
  } else {
    colors <- colorRampPalette(c("navy", "white", "firebrick3"))(100)
  }
  colors[pmax(1, ceiling(x_scaled * 99) + 1)]
}

# Get example genes for each pattern type
pattern_types <- c("gradient", "hotspot", "periodic", "cluster")
pattern_genes <- sapply(pattern_types, function(pt) {
  which(gene_info$pattern_type == pt)[1]
})

# Get non-SVG example
non_svg_gene <- which(!gene_info$is_svg)[1]

oldpar <- par(mfrow = c(2, 3), mar = c(4, 4, 3, 1))

# Plot each pattern type
for (i in seq_along(pattern_types)) {
  gene_idx <- pattern_genes[i]
  gene_name <- rownames(expr_log)[gene_idx]
  gene_expr <- expr_log[gene_idx, ]
  
  plot(coords[, 1], coords[, 2],
       pch = 19, cex = 1.3,
       col = expr_colors(gene_expr),
       xlab = "X", ylab = "Y",
       main = paste0(gene_name, "\n(", pattern_types[i], " pattern)"),
       asp = 1)
}

# Plot non-SVG
gene_expr <- expr_log[non_svg_gene, ]
plot(coords[, 1], coords[, 2],
     pch = 19, cex = 1.3,
     col = expr_colors(gene_expr),
     xlab = "X", ylab = "Y",
     main = paste0(rownames(expr_log)[non_svg_gene], "\n(non-SVG, random)"),
     asp = 1)

# Add color legend
plot.new()
par(mar = c(1, 1, 1, 1))
legend_colors <- colorRampPalette(c("#313695", "#FFFFBF", "#A50026"))(100)
legend_image <- as.raster(matrix(rev(legend_colors), ncol = 1))
plot(c(0, 1), c(0, 1), type = "n", axes = FALSE, xlab = "", ylab = "")
text(0.5, 0.95, "Expression Level", cex = 1.2, font = 2)
rasterImage(legend_image, 0.3, 0.1, 0.7, 0.85)
text(0.75, 0.1, "Low", cex = 0.9)
text(0.75, 0.85, "High", cex = 0.9)
par(oldpar)

## ----meringue-run-------------------------------------------------------------
# Run MERINGUE with KNN network
results_meringue <- CalSVG_MERINGUE(
  expr_matrix = expr_log,
  spatial_coords = coords,
  network_method = "knn",       # Network construction method
  k = 10,                       # Number of neighbors
  alternative = "greater",      # Test for positive autocorrelation
  adjust_method = "BH",         # Benjamini-Hochberg correction
  verbose = FALSE
)

# Display top SVGs
cat("Top 10 SVGs by MERINGUE:\n")
head(results_meringue[, c("gene", "observed", "expected", "z_score", "p.value", "p.adj")], 10)

## ----meringue-viz, fig.width=10, fig.height=4---------------------------------
oldpar <- par(mfrow = c(1, 2), mar = c(4, 4, 3, 1))

# Moran's I distribution
hist(results_meringue$observed, breaks = 40,
     col = "steelblue", border = "white",
     xlab = "Moran's I",
     ylab = "Number of Genes",
     main = "Distribution of Moran's I Statistics")
abline(v = results_meringue$expected[1], col = "red", lwd = 2, lty = 2)
legend("topright", legend = "E[I] under null", col = "red", lty = 2, lwd = 2)

# P-value distribution
hist(results_meringue$p.value, breaks = 40,
     col = "coral", border = "white",
     xlab = "P-value",
     ylab = "Number of Genes",
     main = "P-value Distribution")
abline(v = 0.05, col = "darkred", lwd = 2, lty = 2)
par(oldpar)

## ----binspect-run-------------------------------------------------------------
# Run binSpect with k-means binarization
results_binspect <- CalSVG_binSpect(
  expr_matrix = expr_log,
  spatial_coords = coords,
  bin_method = "kmeans",        # Binarization method
  network_method = "delaunay",  # Network construction
  do_fisher_test = TRUE,        # Perform Fisher's test
  adjust_method = "fdr",        # FDR correction
  verbose = FALSE
)

# Display top SVGs
cat("Top 10 SVGs by binSpect:\n")
head(results_binspect[, c("gene", "estimate", "p.value", "p.adj", "score")], 10)

## ----binspect-viz, fig.width=10, fig.height=4---------------------------------
oldpar <- par(mfrow = c(1, 2), mar = c(4, 4, 3, 1))

# Odds ratio distribution
or_log <- log2(results_binspect$estimate + 0.01)
hist(or_log, breaks = 40,
     col = "mediumpurple", border = "white",
     xlab = expression(log[2](Odds~Ratio)),
     ylab = "Number of Genes",
     main = "Distribution of Odds Ratios")
abline(v = 0, col = "red", lwd = 2, lty = 2)

# Score distribution (combines p-value and OR)
hist(results_binspect$score, breaks = 40,
     col = "darkorange", border = "white",
     xlab = "binSpect Score",
     ylab = "Number of Genes",
     main = "Distribution of binSpect Scores")
par(oldpar)

## ----sparkx-run---------------------------------------------------------------
# Run SPARK-X with mixture of kernels
# Note: Uses raw counts (not log-transformed)
results_sparkx <- CalSVG_SPARKX(
  expr_matrix = expr_counts,    # Raw counts recommended
  spatial_coords = coords,
  kernel_option = "mixture",    # All 11 kernels
  adjust_method = "BY",         # Benjamini-Yekutieli (conservative)
  verbose = FALSE
)

# Display top SVGs
cat("Top 10 SVGs by SPARK-X:\n")
head(results_sparkx[, c("gene", "p.value", "p.adj")], 10)

## ----sparkx-viz, fig.width=10, fig.height=4-----------------------------------
oldpar <- par(mfrow = c(1, 2), mar = c(4, 4, 3, 1))

# Combined p-value distribution
hist(-log10(results_sparkx$p.value + 1e-300), breaks = 40,
     col = "seagreen", border = "white",
     xlab = expression(-log[10](p-value)),
     ylab = "Number of Genes",
     main = "SPARK-X P-value Distribution")

# Volcano-style plot
pval_log <- -log10(results_sparkx$p.adj + 1e-300)
plot(seq_along(pval_log), pval_log,
     pch = 19, cex = 0.6,
     col = ifelse(pval_log > -log10(0.05), "red", "gray50"),
     xlab = "Gene Index",
     ylab = expression(-log[10](adjusted~p-value)),
     main = "SPARK-X Significance Plot")
abline(h = -log10(0.05), col = "red", lty = 2)
par(oldpar)

## ----seurat-run---------------------------------------------------------------
# Run Seurat Moran's I
results_seurat <- CalSVG_Seurat(
  expr_matrix = expr_log,
  spatial_coords = coords,
  weight_scheme = "inverse_squared",  # Seurat default
  adjust_method = "BH",
  verbose = FALSE
)

# Display top SVGs
cat("Top 10 SVGs by Seurat:\n")
head(results_seurat[, c("gene", "observed", "expected", "sd", "p.value", "p.adj")], 10)

## ----calsvg-unified-----------------------------------------------------------
# Example: Run MERINGUE through unified interface
results_unified <- CalSVG(
  expr_matrix = expr_log,
  spatial_coords = coords,
  method = "meringue",       # Options: meringue, binspect, sparkx, seurat, nnsvg, markvario
  network_method = "knn",
  k = 10,
  verbose = FALSE
)

cat("Unified interface results:\n")
head(results_unified[, c("gene", "p.value", "p.adj")], 5)

## ----metrics-calculation------------------------------------------------------
# Ground truth
truth <- gene_info$is_svg

# Function to calculate comprehensive metrics
calc_performance <- function(result, truth, pval_col = "p.adj", threshold = 0.05) {
  detected <- result[[pval_col]] < threshold
  detected[is.na(detected)] <- FALSE
  
  TP <- sum(truth & detected)
  FP <- sum(!truth & detected)
  FN <- sum(truth & !detected)
  TN <- sum(!truth & !detected)
  
  list(
    TP = TP, FP = FP, FN = FN, TN = TN,
    Sensitivity = TP / (TP + FN),
    Specificity = TN / (TN + FP),
    Precision = TP / max(TP + FP, 1),
    NPV = TN / max(TN + FN, 1),
    F1 = 2 * TP / max(2 * TP + FP + FN, 1),
    Accuracy = (TP + TN) / (TP + TN + FP + FN),
    MCC = (TP * TN - FP * FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN) + 1e-10)
  )
}

# Calculate metrics for each method
metrics_list <- list(
  MERINGUE = calc_performance(results_meringue, truth),
  binSpect = calc_performance(results_binspect, truth),
  `SPARK-X` = calc_performance(results_sparkx, truth),
  Seurat = calc_performance(results_seurat, truth)
)

# Create metrics data frame
metrics_df <- do.call(rbind, lapply(names(metrics_list), function(m) {
  data.frame(
    Method = m,
    Sensitivity = metrics_list[[m]]$Sensitivity,
    Specificity = metrics_list[[m]]$Specificity,
    Precision = metrics_list[[m]]$Precision,
    F1 = metrics_list[[m]]$F1,
    MCC = metrics_list[[m]]$MCC
  )
}))

knitr::kable(metrics_df, digits = 3, 
             caption = "Performance Comparison on Simulated Data (FDR < 0.05)")

## ----performance-heatmap, fig.width=9, fig.height=5---------------------------
# Prepare data for heatmap
metrics_matrix <- as.matrix(metrics_df[, -1])
rownames(metrics_matrix) <- metrics_df$Method

# Create heatmap
oldpar <- par(mar = c(5, 8, 4, 6))
image(t(metrics_matrix), axes = FALSE,
      col = colorRampPalette(c("#440154", "#31688E", "#35B779", "#FDE725"))(100),
      main = "Performance Metrics Heatmap")

axis(1, at = seq(0, 1, length.out = ncol(metrics_matrix)),
     labels = colnames(metrics_matrix), las = 2, cex.axis = 0.9)
axis(2, at = seq(0, 1, length.out = nrow(metrics_matrix)),
     labels = rownames(metrics_matrix), las = 1, cex.axis = 0.9)

# Add values
for (i in 1:nrow(metrics_matrix)) {
  for (j in 1:ncol(metrics_matrix)) {
    text((j - 1) / (ncol(metrics_matrix) - 1),
         (i - 1) / (nrow(metrics_matrix) - 1),
         sprintf("%.2f", metrics_matrix[i, j]),
         cex = 0.9, col = ifelse(metrics_matrix[i, j] > 0.6, "white", "black"))
  }
}
par(oldpar)

## ----roc-analysis, fig.width=8, fig.height=7----------------------------------
# Function to compute ROC curve
compute_roc <- function(scores, truth, higher_is_better = FALSE) {
  if (higher_is_better) {
    scores <- -scores
  }
  scores[is.na(scores)] <- max(scores, na.rm = TRUE) + 1
  
  thresholds <- sort(unique(c(-Inf, scores, Inf)))
  
  tpr <- fpr <- numeric(length(thresholds))
  for (i in seq_along(thresholds)) {
    predicted <- scores <= thresholds[i]
    tpr[i] <- sum(truth & predicted) / sum(truth)
    fpr[i] <- sum(!truth & predicted) / sum(!truth)
  }
  
  # Calculate AUC using trapezoidal rule
  ord <- order(fpr, tpr)
  fpr <- fpr[ord]
  tpr <- tpr[ord]
  auc <- sum(diff(fpr) * (head(tpr, -1) + tail(tpr, -1)) / 2)
  
  list(fpr = fpr, tpr = tpr, auc = auc)
}

# Compute ROC for each method
roc_meringue <- compute_roc(results_meringue$p.value, truth)
roc_binspect <- compute_roc(results_binspect$p.value, truth)
roc_sparkx <- compute_roc(results_sparkx$p.value, truth)
roc_seurat <- compute_roc(results_seurat$p.value, truth)

# Plot ROC curves
oldpar <- par(mar = c(5, 5, 4, 2))
plot(roc_meringue$fpr, roc_meringue$tpr, type = "l", lwd = 3, col = "#E41A1C",
     xlab = "False Positive Rate (1 - Specificity)",
     ylab = "True Positive Rate (Sensitivity)",
     main = "Receiver Operating Characteristic (ROC) Curves",
     xlim = c(0, 1), ylim = c(0, 1),
     cex.lab = 1.2, cex.main = 1.3)
lines(roc_binspect$fpr, roc_binspect$tpr, lwd = 3, col = "#377EB8")
lines(roc_sparkx$fpr, roc_sparkx$tpr, lwd = 3, col = "#4DAF4A")
lines(roc_seurat$fpr, roc_seurat$tpr, lwd = 3, col = "#984EA3")
abline(0, 1, lty = 2, col = "gray50", lwd = 2)

# Add AUC values to legend
legend("bottomright", 
       legend = c(
         sprintf("MERINGUE (AUC = %.3f)", roc_meringue$auc),
         sprintf("binSpect (AUC = %.3f)", roc_binspect$auc),
         sprintf("SPARK-X (AUC = %.3f)", roc_sparkx$auc),
         sprintf("Seurat (AUC = %.3f)", roc_seurat$auc)
       ),
       col = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3"),
       lwd = 3, cex = 1.0, bty = "n")
par(oldpar)

## ----overlap-analysis, fig.width=9, fig.height=5------------------------------
# Get significant genes for each method
sig_genes <- list(
  MERINGUE = results_meringue$gene[results_meringue$p.adj < 0.05],
  binSpect = results_binspect$gene[results_binspect$p.adj < 0.05],
  SPARKX = results_sparkx$gene[results_sparkx$p.adj < 0.05],
  Seurat = results_seurat$gene[results_seurat$p.adj < 0.05]
)

# Calculate pairwise Jaccard indices
jaccard <- function(a, b) {
  length(intersect(a, b)) / length(union(a, b))
}

methods <- names(sig_genes)
jaccard_matrix <- matrix(0, length(methods), length(methods))
rownames(jaccard_matrix) <- colnames(jaccard_matrix) <- methods

for (i in seq_along(methods)) {
  for (j in seq_along(methods)) {
    jaccard_matrix[i, j] <- jaccard(sig_genes[[i]], sig_genes[[j]])
  }
}

# Visualize overlap
oldpar <- par(mfrow = c(1, 2), mar = c(5, 6, 4, 4))

# Jaccard similarity heatmap
image(jaccard_matrix, axes = FALSE,
      col = colorRampPalette(c("white", "steelblue", "darkblue"))(100),
      main = "Pairwise Jaccard Similarity")
axis(1, at = seq(0, 1, length.out = 4), labels = methods, las = 2, cex.axis = 0.9)
axis(2, at = seq(0, 1, length.out = 4), labels = methods, las = 1, cex.axis = 0.9)

for (i in 1:4) {
  for (j in 1:4) {
    text((j - 1) / 3, (i - 1) / 3, 
         sprintf("%.2f", jaccard_matrix[i, j]),
         cex = 1.0, col = ifelse(jaccard_matrix[i, j] > 0.5, "white", "black"))
  }
}

# Number of significant genes
barplot(sapply(sig_genes, length),
        col = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3"),
        ylab = "Number of Significant Genes",
        main = "Number of SVGs Detected",
        las = 1, border = NA)
abline(h = sum(truth), lty = 2, col = "red", lwd = 2)
legend("topright", legend = "True SVGs (n=50)", lty = 2, col = "red", lwd = 2, bty = "n")
par(oldpar)

## ----custom-simulation--------------------------------------------------------
# Set seed for reproducibility
set.seed(2024)

# Generate custom dataset with specific parameters
sim_data <- simulate_spatial_data(
  n_spots = 400,              # Number of spatial locations
  n_genes = 150,              # Total genes
  n_svg = 30,                 # Number of true SVGs
  grid_type = "hexagonal",    # Spatial arrangement
  pattern_types = c("gradient", "hotspot", "periodic"),  # Pattern types to include
  mean_counts = 150,          # Mean expression level
  dispersion = 2.5            # Negative binomial dispersion
)

cat("Custom Simulation Summary:\n")
cat("  Spots:", ncol(sim_data$counts), "\n")
cat("  Genes:", nrow(sim_data$counts), "\n")
cat("  True SVGs:", sum(sim_data$gene_info$is_svg), "\n")
cat("\nPattern distribution:\n")
print(table(sim_data$gene_info$pattern_type))

## ----parallel-demo------------------------------------------------------------
# Demonstrate parallel processing
# Note: mclapply doesn't work on Windows; falls back to sequential

n_cores <- 2  # Adjust based on your system

t_start <- Sys.time()
results_parallel <- CalSVG_MERINGUE(
  expr_matrix = expr_log,
  spatial_coords = coords,
  n_threads = n_cores,
  verbose = FALSE
)
t_end <- Sys.time()

cat(sprintf("Parallel execution with %d cores: %.2f seconds\n", 
            n_cores, as.numeric(t_end - t_start, units = "secs")))

## ----gene-filtering-----------------------------------------------------------
# Pre-filter genes to improve signal-to-noise and reduce computation

# Strategy 1: Expression threshold
gene_means <- rowMeans(expr_log)
expr_high <- expr_log[gene_means > quantile(gene_means, 0.1), ]
cat("After expression filter:", nrow(expr_high), "genes\n")

# Strategy 2: Variance filter
gene_vars <- apply(expr_high, 1, var)
expr_filtered <- expr_high[gene_vars > quantile(gene_vars, 0.25), ]
cat("After variance filter:", nrow(expr_filtered), "genes\n")

# Strategy 3: Coefficient of variation
cv <- apply(expr_high, 1, sd) / (rowMeans(expr_high) + 0.1)
expr_cv <- expr_high[cv > quantile(cv, 0.25), ]
cat("After CV filter:", nrow(expr_cv), "genes\n")

## ----session-info-------------------------------------------------------------
sessionInfo()