# Set root directory
here::i_am("scripts/DE_model_comparison.Rmd")
# Load libraries
suppressPackageStartupMessages({
library(DESeq2)
library(tidyverse)
library(dplyr)
library(UpSetR)
library(here)
library(ggplot2)
library(variancePartition)
library(BiocParallel)
library(limma)
})
# Load transcript abundance matrix
abundance_matrix <- read.table(here::here("data", "DE", "abundance_matrix.txt"), header = TRUE, row.names = 1, check.names = FALSE)
head(abundance_matrix)
## 39_Melops1040 38_Melops1037 37_Melops1036 36_Melops1035
## TRINITY_GG_716_c838_g1 0 2 0 0
## TRINITY_GG_1114_c422_g3 0 0 0 0
## TRINITY_GG_1463_c526_g6 0 7 0 3
## TRINITY_GG_1586_c745_g1 0 0 0 0
## TRINITY_GG_560_c228_g1 1 2 1 0
## TRINITY_GG_606_c418_g2 1 1 1 1
## 35_Melops1033 34_Melops1025 33_Melops1024 32_Melops1023
## TRINITY_GG_716_c838_g1 0 0 0 0
## TRINITY_GG_1114_c422_g3 0 0 0 0
## TRINITY_GG_1463_c526_g6 0 0 1 0
## TRINITY_GG_1586_c745_g1 0 0 0 0
## TRINITY_GG_560_c228_g1 0 0 0 0
## TRINITY_GG_606_c418_g2 3 6 5 0
## 31_Melops1022 30_Melops1021 29_Melops1211 28_Melops1210
## TRINITY_GG_716_c838_g1 0 0 0 0
## TRINITY_GG_1114_c422_g3 0 0 0 0
## TRINITY_GG_1463_c526_g6 2 1 2 0
## TRINITY_GG_1586_c745_g1 0 0 0 3
## TRINITY_GG_560_c228_g1 0 0 4 0
## TRINITY_GG_606_c418_g2 2 2 0 1
## 27_Melops1209 26_Melops1206 25_Melops1205 24_Melops1082
## TRINITY_GG_716_c838_g1 0 0 0 1
## TRINITY_GG_1114_c422_g3 0 0 0 0
## TRINITY_GG_1463_c526_g6 0 0 0 0
## TRINITY_GG_1586_c745_g1 0 0 0 0
## TRINITY_GG_560_c228_g1 0 0 0 0
## TRINITY_GG_606_c418_g2 0 3 5 0
## 23_Melops1065 22_Melops1061 21_Melops1058 20_Melops1046
## TRINITY_GG_716_c838_g1 0 0 0 1
## TRINITY_GG_1114_c422_g3 0 0 0 0
## TRINITY_GG_1463_c526_g6 0 0 0 0
## TRINITY_GG_1586_c745_g1 0 0 0 0
## TRINITY_GG_560_c228_g1 0 0 0 0
## TRINITY_GG_606_c418_g2 4 2 1 1
## 19_Melops1131 18_Melops1075 17_Melops1072 16_Melops1017
## TRINITY_GG_716_c838_g1 0 0 0 0
## TRINITY_GG_1114_c422_g3 0 0 0 0
## TRINITY_GG_1463_c526_g6 0 0 0 0
## TRINITY_GG_1586_c745_g1 0 0 0 0
## TRINITY_GG_560_c228_g1 1 1 0 1
## TRINITY_GG_606_c418_g2 2 1 0 0
## 15_Melops1014 14_Melops1202 13_Melops1200 12_Melops1197
## TRINITY_GG_716_c838_g1 0 0 0 0
## TRINITY_GG_1114_c422_g3 0 0 0 1
## TRINITY_GG_1463_c526_g6 0 0 0 2
## TRINITY_GG_1586_c745_g1 0 0 0 0
## TRINITY_GG_560_c228_g1 2 0 0 2
## TRINITY_GG_606_c418_g2 2 3 4 0
## 11_Melops1194 10_Melops1192 09_Melops1158 08_Melops1126
## TRINITY_GG_716_c838_g1 0 0 0 1
## TRINITY_GG_1114_c422_g3 0 0 0 0
## TRINITY_GG_1463_c526_g6 1 0 2 1
## TRINITY_GG_1586_c745_g1 0 0 3 0
## TRINITY_GG_560_c228_g1 0 1 4 0
## TRINITY_GG_606_c418_g2 3 5 5 2
## 07_Melops1015 06_Melops1010 05_Melops1259 04_Melops1239
## TRINITY_GG_716_c838_g1 0 0 0 0
## TRINITY_GG_1114_c422_g3 0 0 0 0
## TRINITY_GG_1463_c526_g6 1 0 0 0
## TRINITY_GG_1586_c745_g1 0 0 0 0
## TRINITY_GG_560_c228_g1 0 0 0 0
## TRINITY_GG_606_c418_g2 0 4 0 4
## 03_Melops1218 02_Melops1195 01_Melops1188
## TRINITY_GG_716_c838_g1 0 0 0
## TRINITY_GG_1114_c422_g3 0 0 0
## TRINITY_GG_1463_c526_g6 0 0 0
## TRINITY_GG_1586_c745_g1 0 0 0
## TRINITY_GG_560_c228_g1 0 0 0
## TRINITY_GG_606_c418_g2 3 0 2
# Load sample metadata
sample_info <- read.table(here::here("data", "DE", "samples.txt"), header = FALSE, col.names = c("condition", "sample"))
# Parse temperature and origin from condition
sample_info <- sample_info %>%
mutate(
origin = substr(condition, 1, 1),
temperature = substr(condition, 2, nchar(condition))
) %>%
mutate(
origin = recode(origin, "W" = "West", "H" = "Hybrid", "S" = "South"),
temperature = factor(temperature, levels = c("12", "15", "18")),
origin = factor(origin)
)
sample_info
## condition sample origin temperature
## 1 W18 39_Melops1040 West 18
## 2 W18 38_Melops1037 West 18
## 3 W18 37_Melops1036 West 18
## 4 W18 36_Melops1035 West 18
## 5 W18 35_Melops1033 West 18
## 6 W15 34_Melops1025 West 15
## 7 W15 33_Melops1024 West 15
## 8 W15 32_Melops1023 West 15
## 9 W15 31_Melops1022 West 15
## 10 W15 30_Melops1021 West 15
## 11 W12 29_Melops1211 West 12
## 12 W12 28_Melops1210 West 12
## 13 W12 27_Melops1209 West 12
## 14 W12 26_Melops1206 West 12
## 15 W12 25_Melops1205 West 12
## 16 H18 24_Melops1082 Hybrid 18
## 17 H18 23_Melops1065 Hybrid 18
## 18 H18 22_Melops1061 Hybrid 18
## 19 H18 21_Melops1058 Hybrid 18
## 20 H18 20_Melops1046 Hybrid 18
## 21 H15 19_Melops1131 Hybrid 15
## 22 H15 18_Melops1075 Hybrid 15
## 23 H15 17_Melops1072 Hybrid 15
## 24 H15 16_Melops1017 Hybrid 15
## 25 H15 15_Melops1014 Hybrid 15
## 26 H12 14_Melops1202 Hybrid 12
## 27 H12 13_Melops1200 Hybrid 12
## 28 H12 12_Melops1197 Hybrid 12
## 29 H12 11_Melops1194 Hybrid 12
## 30 H12 10_Melops1192 Hybrid 12
## 31 S18 09_Melops1158 South 18
## 32 S18 08_Melops1126 South 18
## 33 S15 07_Melops1015 South 15
## 34 S15 06_Melops1010 South 15
## 35 S12 05_Melops1259 South 12
## 36 S12 04_Melops1239 South 12
## 37 S12 03_Melops1218 South 12
## 38 S12 02_Melops1195 South 12
## 39 S12 01_Melops1188 South 12
# Match column names
colnames(abundance_matrix) <- sample_info$sample
# Ensure non-negative integer count
abundance_matrix <- round(pmax(abundance_matrix, 0))
# Keep transcripts with total count ≥ 10 across all samples
keep_transcripts <- rowSums(abundance_matrix) >= 10
abundance_matrix <- abundance_matrix[keep_transcripts, ]
# Define condition model
dds_condition <- DESeqDataSetFromMatrix(countData = abundance_matrix, colData = sample_info, design = ~ condition)
# Define additive model
dds_additive <- DESeqDataSetFromMatrix(countData = abundance_matrix, colData = sample_info, design = ~ temperature + origin)
# Define interaction model
dds_interaction <- DESeqDataSetFromMatrix(countData = abundance_matrix, colData = sample_info, design = ~ temperature * origin)
dds_interaction <- DESeq(dds_interaction)
dds_lrt_inter_vs_add <- DESeq(dds_interaction, test = "LRT", reduced = ~ temperature + origin)
res_inter_vs_add <- results(dds_lrt_inter_vs_add, alpha = 0.05)
summary(res_inter_vs_add)
##
## out of 168455 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 162, 0.096%
## LFC < 0 (down) : 579, 0.34%
## outliers [1] : 116, 0.069%
## low counts [2] : 140332, 83%
## (mean count < 18)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Filter transcripts with adjusted p-value < 0.05 from the LRT
interaction_transcripts <- rownames(res_inter_vs_add)[which(res_inter_vs_add$padj < 0.05)]
# Save to file for use in additive model DE script (Tier 2 overlap)
write.csv(
data.frame(transcript_id = interaction_transcripts),
here::here("results", "DE", "interaction_effects", "significant_interaction_transcripts.csv"),
row.names = FALSE
)
# Print number of transcripts
cat("Number of transcripts with significant interaction effects:", length(interaction_transcripts), "\n")
## Number of transcripts with significant interaction effects: 741
dds_condition <- DESeq(dds_condition)
dds_lrt_cond_vs_null <- DESeq(dds_condition, test = "LRT", reduced = ~ 1)
res_cond_vs_null <- results(dds_lrt_cond_vs_null, alpha = 0.05)
summary(res_cond_vs_null)
##
## out of 168455 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 9449, 5.6%
## LFC < 0 (down) : 8567, 5.1%
## outliers [1] : 116, 0.069%
## low counts [2] : 22862, 14%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
dds_additive <- DESeq(dds_additive)
dds_lrt_add_vs_null <- DESeq(dds_additive, test = "LRT", reduced = ~ 1)
res_add_vs_null <- results(dds_lrt_add_vs_null, alpha = 0.05)
summary(res_add_vs_null)
##
## out of 168455 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 10969, 6.5%
## LFC < 0 (down) : 11903, 7.1%
## outliers [1] : 322, 0.19%
## low counts [2] : 88071, 52%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
dds_lrt_inter_vs_null <- DESeq(dds_interaction, test = "LRT", reduced = ~ 1)
res_inter_vs_null <- results(dds_lrt_inter_vs_null, alpha = 0.05)
summary(res_inter_vs_null)
##
## out of 168455 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 8829, 5.2%
## LFC < 0 (down) : 9187, 5.5%
## outliers [1] : 116, 0.069%
## low counts [2] : 22862, 14%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Create list of significant transcripts from each comparison
listInput <- list(
Inter_vs_Add = rownames(res_inter_vs_add)[which(res_inter_vs_add$padj < 0.05)],
Cond_vs_Null = rownames(res_cond_vs_null)[which(res_cond_vs_null$padj < 0.05)],
Add_vs_Null = rownames(res_add_vs_null)[which(res_add_vs_null$padj < 0.05)],
Inter_vs_Null = rownames(res_inter_vs_null)[which(res_inter_vs_null$padj < 0.05)]
)
# UpSet plot
upset(fromList(listInput), order.by = "freq", main.bar.color = "steelblue")
# Intersection table
all_transcripts <- unique(unlist(listInput))
presence_df <- tibble(
transcript = all_transcripts,
Inter_vs_Add = all_transcripts %in% listInput$Inter_vs_Add,
Cond_vs_Null = all_transcripts %in% listInput$Cond_vs_Null,
Add_vs_Null = all_transcripts %in% listInput$Add_vs_Null,
Inter_vs_Null = all_transcripts %in% listInput$Inter_vs_Null
)
summary_table <- presence_df %>%
group_by(Inter_vs_Add, Cond_vs_Null, Add_vs_Null, Inter_vs_Null) %>%
summarise(TranscriptCount = n(), .groups = "drop") %>%
arrange(desc(TranscriptCount))
print(summary_table)
## # A tibble: 6 × 5
## Inter_vs_Add Cond_vs_Null Add_vs_Null Inter_vs_Null TranscriptCount
## <lgl> <lgl> <lgl> <lgl> <int>
## 1 FALSE TRUE TRUE TRUE 16317
## 2 FALSE FALSE TRUE FALSE 5883
## 3 FALSE TRUE FALSE TRUE 962
## 4 TRUE TRUE TRUE TRUE 672
## 5 TRUE TRUE FALSE TRUE 65
## 6 TRUE FALSE FALSE FALSE 4
# Totals for each model
totals_table <- tibble(
Model = c("Interaction vs Additive", "Condition vs Null", "Additive vs Null", "Interaction vs Null"),
SignificantTranscripts = c(
length(listInput$Inter_vs_Add),
length(listInput$Cond_vs_Null),
length(listInput$Add_vs_Null),
length(listInput$Inter_vs_Null)
)
)
print(totals_table)
## # A tibble: 4 × 2
## Model SignificantTranscripts
## <chr> <int>
## 1 Interaction vs Additive 741
## 2 Condition vs Null 18016
## 3 Additive vs Null 22872
## 4 Interaction vs Null 18016
# Number of transcripts before filtering
n_before <- nrow(expr_matrix)
# Number of transcripts after filtering
n_after <- nrow(expr_filtered)
# Number of transcripts removed
n_removed <- n_before - n_after
cat("Transcripts before filtering:", n_before, "\n")
## Transcripts before filtering: 168455
cat("Transcripts after filtering:", n_after, "\n")
## Transcripts after filtering: 22177
cat("Transcripts removed:", n_removed, "\n")
## Transcripts removed: 146278
# Run variance partitioning
form_condition <- ~ (1|condition)
form_additive <- ~ (1|origin) + (1|temperature)
form_interaction <- ~ (1|origin) + (1|temperature) + (1|origin:temperature)
varPart_condition <- fitExtractVarPartModel(expr_filtered, form_condition, sample_info)
varPart_additive <- fitExtractVarPartModel(expr_filtered, form_additive, metadata)
varPart_interaction <- fitExtractVarPartModel(expr_filtered, form_interaction, metadata)
# Plot 1: Condition model
plotVarPart(varPart_condition) +
theme_minimal(base_size = 14) +
ggtitle("Variance Partitioning - Condition Model")
# Plot 2: Additive model
plotVarPart(varPart_additive) +
theme_minimal(base_size = 14) +
ggtitle("Variance Partitioning - Additive Model")
# Plot 3: Interaction model
plotVarPart(varPart_interaction) +
theme_minimal(base_size = 14) +
ggtitle("Variance Partitioning - Interaction Model")
# Convert to data frames and add model labels
vp_add <- as.data.frame(varPart_additive) %>% mutate(Model = "Additive")
vp_int <- as.data.frame(varPart_interaction) %>% mutate(Model = "Interaction")
vp_cond <- as.data.frame(varPart_condition) %>% mutate(Model = "Condition")
# Combine and reshape
vp_combined <- bind_rows(vp_add, vp_int, vp_cond) %>%
pivot_longer(cols = -Model, names_to = "Factor", values_to = "Variance") %>%
filter(is.finite(Variance))
# Plot 4a: Boxplot comparison
ggplot(vp_combined, aes(x = Factor, y = Variance, fill = Model)) +
geom_boxplot(outlier.size = 0.5) +
theme_minimal(base_size = 14) +
labs(title = "Variance Explained by Factor: All Models")
# Plot 4b: Violin plot comparison
ggplot(vp_combined, aes(x = Factor, y = Variance, fill = Model)) +
geom_violin(trim = FALSE, scale = "width") +
theme_minimal(base_size = 14) +
labs(title = "Variance Distribution by Factor: All Models")
# Compute Euclidean distance matrix
dist_matrix <- dist(t(assay(vsd))) # transpose so samples are rows
# Perform classical MDS
mds_coords <- cmdscale(dist_matrix, k = 2)
mds_df <- as.data.frame(mds_coords)
colnames(mds_df) <- c("MDS1", "MDS2")
mds_df$Sample <- colnames(vsd)
mds_df <- cbind(mds_df, as.data.frame(colData(vsd)))
# Plot MDS with color by Temperature and shape by Origin
ggplot(mds_df, aes(x = MDS1, y = MDS2, color = factor(temperature), shape = factor(origin))) +
geom_point(size = 3) +
labs(
title = "MDS Plot of Trinity-derived Counts",
color = "Temperature",
shape = "Origin"
) +
theme_minimal(base_size = 14)
# Save to file
ggsave(
filename = here::here("results", "sample_clustering", "MDS", "MDS_plot.png"),
width = 8, height = 6, dpi = 300
)
This script compares condition, additive, and interaction models using variance partitioning and LRTs to evaluate differential expression patterns in Symphodus melops. The condition model explained the highest proportion of variance, reflecting its flexibility in treating each origin-temperature combination as a distinct group. However, this structure does not share information across conditions and is prone to overfitting, especially with limited replication. In contrast, the additive model explained substantial variance, particularly for temperature, and identified the largest number of significantly differentially expressed transcripts. This is supported by the MDS plot, which shows that samples primarily cluster by temperature, with less separation by origin — reinforcing the additive model’s ability to capture the dominant biological signal. LRT comparing the interaction model to the additive model revealed that only 741 transcripts (~0.4%) showed significantly improved fit when including the interaction term. These transcripts were considered candidates for origin-dependent temperature responses and were extracted for downstream analysis. Based on these results, the additive model was selected for global differential expression analysis.
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Oslo
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] variancePartition_1.38.1 BiocParallel_1.42.2
## [3] limma_3.64.3 here_1.0.2
## [5] UpSetR_1.4.0 lubridate_1.9.4
## [7] forcats_1.0.1 stringr_1.5.2
## [9] dplyr_1.1.4 purrr_1.1.0
## [11] readr_2.1.5 tidyr_1.3.1
## [13] tibble_3.3.0 ggplot2_4.0.0
## [15] tidyverse_2.0.0 DESeq2_1.48.2
## [17] SummarizedExperiment_1.38.1 Biobase_2.68.0
## [19] MatrixGenerics_1.20.0 matrixStats_1.5.0
## [21] GenomicRanges_1.60.0 GenomeInfoDb_1.44.3
## [23] IRanges_2.42.0 S4Vectors_0.46.0
## [25] BiocGenerics_0.54.0 generics_0.1.4
##
## loaded via a namespace (and not attached):
## [1] Rdpack_2.6.4 bitops_1.0-9 gridExtra_2.3
## [4] rlang_1.1.6 magrittr_2.0.4 compiler_4.5.1
## [7] systemfonts_1.3.1 reshape2_1.4.4 vctrs_0.6.5
## [10] pkgconfig_2.0.3 crayon_1.5.3 fastmap_1.2.0
## [13] backports_1.5.0 XVector_0.48.0 labeling_0.4.3
## [16] utf8_1.2.6 caTools_1.18.3 rmarkdown_2.30
## [19] tzdb_0.5.0 UCSC.utils_1.4.0 nloptr_2.2.1
## [22] ragg_1.5.0 xfun_0.53 cachem_1.1.0
## [25] jsonlite_2.0.0 EnvStats_3.1.0 remaCor_0.0.20
## [28] DelayedArray_0.34.1 broom_1.0.10 parallel_4.5.1
## [31] R6_2.6.1 bslib_0.9.0 stringi_1.8.7
## [34] RColorBrewer_1.1-3 boot_1.3-28 jquerylib_0.1.4
## [37] numDeriv_2016.8-1.1 Rcpp_1.1.0 iterators_1.0.14
## [40] knitr_1.50 Matrix_1.4-0 splines_4.5.1
## [43] timechange_0.3.0 tidyselect_1.2.1 abind_1.4-8
## [46] yaml_2.3.10 gplots_3.2.0 codetools_0.2-18
## [49] lattice_0.20-45 lmerTest_3.1-3 plyr_1.8.9
## [52] withr_3.0.2 S7_0.2.0 evaluate_1.0.5
## [55] pillar_1.11.1 KernSmooth_2.23-20 reformulas_0.4.1
## [58] rprojroot_2.1.1 hms_1.1.3 scales_1.4.0
## [61] aod_1.3.3 minqa_1.2.8 gtools_3.9.5
## [64] RhpcBLASctl_0.23-42 glue_1.8.0 tools_4.5.1
## [67] fANCOVA_0.6-1 lme4_1.1-37 locfit_1.5-9.12
## [70] mvtnorm_1.3-3 grid_4.5.1 rbibutils_2.3
## [73] nlme_3.1-155 GenomeInfoDbData_1.2.14 cli_3.6.5
## [76] textshaping_1.0.3 S4Arrays_1.8.1 corpcor_1.6.10
## [79] gtable_0.3.6 sass_0.4.10 digest_0.6.37
## [82] SparseArray_1.8.1 pbkrtest_0.5.5 farver_2.1.2
## [85] htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7
## [88] statmod_1.5.0 MASS_7.3-55