Preamble

Dependencies:

suppressPackageStartupMessages(library(plantmix))
suppressPackageStartupMessages(library(lme4))

Overview

In this vignette, varietal mixtures will be assembled from a panel of 200 genotypes. Only 1000 mixtures will be observed, among all 1.99^{4} possible ones, i.e., approximately 5%. This incomplete design will be optimized so that it is balanced: all genotype will be observed in the same number of mixtures. Phenotypes will then be simulated organized into a field trial in a randomized complete block design, according to all six GMA-SMA models that can be conceived: 1, 2, 2’, 2’‘, 3 and 3’. See the article by Forst et al (2019), especially for the first three models, as well as the introductory vignette.

Simulate data

set.seed(12345)

Simulate the genotypes

Set the dimensions

nbGenos <- 200
levGenos <- sprintf(
  fmt = paste0("g%0", floor(log10(nbGenos)) + 1, "i"),
  1:nbGenos
)

nbSnps <- 1000
levSnps <- sprintf(
  fmt = paste0("s%0", floor(log10(nbSnps)) + 1, "i"),
  1:nbSnps
)

SNP genotypes

nb_pops <- 10
weak_div_pops <- diag(nb_pops)
weak_div_pops[upper.tri(weak_div_pops)] <- 0.9
weak_div_pops[lower.tri(weak_div_pops)] <- weak_div_pops[upper.tri(weak_div_pops)]
tmp <- rep(nbGenos / nb_pops, nb_pops - 1)
tmp <- c(tmp, nbGenos - sum(tmp))
snpGenos <- simulGenosDoseStruct(
  nb_genos = tmp,
  nb_snps = nbSnps,
  div_pops = weak_div_pops,
  geno_IDs = levGenos,
  snp_IDs = levSnps
)
dim(snpGenos)
## [1]  200 1000
snpGenos[1:3, 1:4]
##      s0001 s0002 s0003 s0004
## g001     1     1     1     1
## g002     1     0     1     1
## g003     1     1     2     1

Additive genetic relationships

For simplicity, the first estimator of VanRaden (2008) is used, that assumes linkage equilibrium and Hardy-Weinberg equilibrium.

GRM <- estimGRM(snpGenos)
GRM <- as.matrix(Matrix::nearPD(GRM)$mat)

image(Matrix(GRM), main = "GRM")

hist(diag(GRM), main = "GRM")

hist(GRM[upper.tri(GRM)], main = "GRM")

Simulate the phenotypes

Incomplete, yet balanced design

nbMixes <- 1000
design <- getDesignBinaryVarMix(levGenos, nbMixes = nbMixes)
tmp <- getMixturesPerGeno(getMixtureList(design$combs))
table(sapply(tmp, length)) # each genotype is in the same nb of mixtures -> balanced design
## 
##  10 
## 200

Field trial

monoStands <- paste(levGenos, levGenos, sep = "_")

pairs <- design$combs
mixStands <- paste(pairs[, 1], pairs[, 2], sep = "_")
IDs <- c(monoStands, mixStands)

nbBlocks <- 3
blocks <- LETTERS[1:nbBlocks]

dat <- do.call(rbind, lapply(blocks, function(block) {
  data.frame(
    ID = IDs,
    block = block,
    stringsAsFactors = TRUE
  )
}))

Design matrices

listContr <- list(block = "contr.sum")
X <- model.matrix(~ 1 + block, data = dat, contrasts = listContr)

allZ <- list()
allZ$GMA <- mkZGMA(dat, col = "ID", sep = "_")
system.time(
  allZ <- append(
    allZ,
    mkAllZSMA(dat, col = "ID", sep = "_", verbose = 1)
  )
)
## [1] "model 2'"
## [1] "model 2"
## [1] "model 2''"
## [1] "model 3"
## [1] "model 3'"
##    user  system elapsed 
##   9.983   0.686  10.670
sapply(allZ, dim)
##       GMA SMA_mod2p SMA_mod2 SMA_mod2pp_ij SMA_mod2pp_ii SMA_mod3 SMA_mod3p_ij
## [1,] 3600      3600     3600          3600          3600     3600         3600
## [2,]  200      1000     1200          1000           200     1200         1000
##      SMA_mod3p_ii
## [1,]         3600
## [2,]          200

Parameters

Several packages will be tested below, most notably lme4 and TMB. However, lme4 does not natively take a GRM as input. Hence the GRM will be replaced by the identity matrix to ensure a fair comparison between packages.

truth <- list(
  "intercept" = 100,
  "var_GMA" = 10,
  "var_SMA" = 2,
  "var_SMA_ij" = 1.5,
  "var_SMA_ii" = 0.8,
  "var_error" = 1
)
set.seed(1234)
truth[["blockEffs"]] <- sample(x = c(-1, 1), size = nbBlocks - 1, replace = TRUE) *
  rnorm(n = nbBlocks - 1, mean = 3, sd = 2)
GRM <- diag(nbGenos)
truth[["GMAs"]] <- MASS::mvrnorm(
  n = 1, mu = rep(0, nbGenos),
  Sigma = truth$var_GMA * GRM
)
truth[["SMAs"]] <- rnorm(n = length(IDs), mean = 0, sd = sqrt(truth$var_SMA))
truth[["SMAs_ij"]] <- rnorm(n = length(mixStands), mean = 0, sd = sqrt(truth$var_SMA_ij))
truth[["SMAs_ii"]] <- rnorm(n = length(monoStands), mean = 0, sd = sqrt(truth$var_SMA_ii))
truth[["errors"]] <- rnorm(n = nrow(dat), mean = 0, sd = sqrt(truth$var_error))

Phenotypes

y1 <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  truth$errors
dat$pheno1 <- y1[, 1]

y2 <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod2 %*% truth$SMAs +
  truth$errors
dat$pheno2 <- y2[, 1]

y3 <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod3 %*% truth$SMAs +
  truth$errors
dat$pheno3 <- y3[, 1]

y2p <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod2p %*% truth$SMAs_ij +
  truth$errors
dat$pheno2p <- y2p[, 1]

y2pp <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod2pp_ij %*% truth$SMAs_ij +
  allZ$SMA_mod2pp_ii %*% truth$SMAs_ii +
  truth$errors
dat$pheno2pp <- y2pp[, 1]

y3p <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod3p_ij %*% truth$SMAs_ij +
  allZ$SMA_mod3p_ii %*% truth$SMAs_ii +
  truth$errors
dat$pheno3p <- y3p[, 1]

Infer the parameters

Function fitGMASMA allows to fit the models with various packages. In this vignette, only lme4 and TMB will be used and, as you can see below, both return the same estimates. (To save time, MM4LMM is commented.)

pkgs <- c("lme4", "TMB") # , "MM4LMM")
if ("MM4LMM" %in% pkgs) {
  suppressPackageStartupMessages(library(MM4LMM))
}

runAllPkgs <- function(pkgs, form, dat, listZ, listVCov, listContr, ...) {
  ## mcMap(function(pkg) {
  Map(function(pkg) {
    print(paste0("fit with ", pkg, "..."))
    fitGMASMA(form, dat, listZ, pkg, listVCov, listContr)
    st <- system.time(
      try(fit <- fitGMASMA(form, dat, listZ, pkg, listVCov, listContr, REML = TRUE, ...))
    )
    print(st)
    fit
  }, pkgs) # , mc.cores = nbCores)
}

Model 1

Fit

form <- pheno1 ~ 1 + block
listZ <- list("GMA" = allZ$GMA)
listVCov <- list("GMA" = GRM)
listContr <- list(block = "contr.sum")
fits1 <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
##    user  system elapsed 
##   0.081   0.125   0.065 
## [1] "fit with TMB..."
##    user  system elapsed 
##   0.350   0.382   0.301

Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits1$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_error")]),
  estim = tmp[c("GMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim       nBE
## var_GMA      10 10.5474529  5.474529
## var_error     1  0.9599666 -4.003335

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_error")]),
  estim = do.call(c, fits1$TMB$report[c("var_GMA", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim       nBE
## var_GMA      10 10.5474516  5.474516
## var_error     1  0.9599667 -4.003335

if ("MM4LMM" %in% names(fits1)) {
  print(fits1$MM4LMM$Sigma2)
}

Model 2

Fit

form <- pheno2 ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA" = allZ$SMA_mod2
)
listVCov <- list(
  "GMA" = GRM,
  "SMA" = diag(ncol(listZ$SMA))
)
listContr <- list(block = "contr.sum")
fits2 <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
##    user  system elapsed 
##   0.399   0.734   0.315 
## [1] "fit with TMB..."
##    user  system elapsed 
##   1.339   1.405   1.237

Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits2$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = tmp[c("GMA", "SMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim        nBE
## var_GMA      10 10.4978544  4.9785441
## var_SMA       2  2.0197144  0.9857179
## var_error     1  0.9453828 -5.4617170

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = do.call(c, fits2$TMB$report[c("var_GMA", "var_SMA1", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim        nBE
## var_GMA      10 10.4979307  4.9793071
## var_SMA       2  2.0197059  0.9852969
## var_error     1  0.9453837 -5.4616269

if ("MM4LMM" %in% names(fits2)) {
  print(fits2$MM4LMM$Sigma2)
}

Model 3

Fit

form <- pheno3 ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA" = allZ$SMA_mod3
)
listVCov <- list(
  "GMA" = GRM,
  "SMA" = diag(ncol(listZ$SMA))
)
listContr <- list(block = "contr.sum")
fits3 <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
##    user  system elapsed 
##   0.538   0.897   0.392 
## [1] "fit with TMB..."
##    user  system elapsed 
##   1.665   1.985   1.477

Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits3$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = tmp[c("GMA", "SMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim       nBE
## var_GMA      10 10.2611009  2.611009
## var_SMA       2  2.1054392  5.271960
## var_error     1  0.9469001 -5.309988

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = do.call(c, fits3$TMB$report[c("var_GMA", "var_SMA1", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim       nBE
## var_GMA      10 10.2611011  2.611011
## var_SMA       2  2.1054394  5.271968
## var_error     1  0.9469001 -5.309992

if ("MM4LMM" %in% names(fits3)) {
  print(fits3$MM4LMM$Sigma2)
}

Model 2’

Fit

form <- pheno2p ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA" = allZ$SMA_mod2p
)
listVCov <- list(
  "GMA" = GRM,
  "SMA" = diag(ncol(listZ$SMA))
)
listContr <- list(block = "contr.sum")
fits2p <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
##    user  system elapsed 
##   0.332   0.575   0.254 
## [1] "fit with TMB..."
##    user  system elapsed 
##   1.173   1.154   1.055

Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits2p$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = tmp[c("GMA", "SMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim        nBE
## var_GMA      10 10.3596086   3.596086
## var_SMA       2  1.6026602 -19.866989
## var_error     1  0.9456708  -5.432918

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = do.call(c, fits2p$TMB$report[c("var_GMA", "var_SMA1", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim        nBE
## var_GMA      10 10.3596074   3.596074
## var_SMA       2  1.6026609 -19.866954
## var_error     1  0.9456708  -5.432919

if ("MM4LMM" %in% names(fits2p)) {
  print(fits2p$MM4LMM$Sigma2)
}

Model 2’’

Fit

form <- pheno2pp ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA_ij" = allZ$SMA_mod2pp_ij,
  "SMA_ii" = allZ$SMA_mod2pp_ii
)
listVCov <- list(
  "GMA" = GRM,
  "SMA_ij" = diag(ncol(listZ$SMA_ij)),
  "SMA_ii" = diag(ncol(listZ$SMA_ii))
)
listContr <- list(block = "contr.sum")
fits2pp <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
##    user  system elapsed 
##   0.594   1.076   0.456 
## [1] "fit with TMB..."
##    user  system elapsed 
##   1.334   1.403   1.232

Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits2pp$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
  estim = tmp[c("GMA", "SMA_ij", "SMA_ii", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##            truth      estim         nBE
## var_GMA     10.0 10.0083128  0.08312812
## var_SMA_ij   1.5  1.6172868  7.81911753
## var_SMA_ii   0.8  0.8345907  4.32383996
## var_error    1.0  0.9453832 -5.46167587

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
  estim = do.call(c, fits2pp$TMB$report[c("var_GMA", "var_SMA1", "var_SMA2", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##            truth      estim         nBE
## var_GMA     10.0 10.0082750  0.08274965
## var_SMA_ij   1.5  1.6172864  7.81909464
## var_SMA_ii   0.8  0.8345879  4.32348766
## var_error    1.0  0.9453837 -5.46163333

if ("MM4LMM" %in% names(fits2pp)) {
  print(fits2pp$MM4LMM$Sigma2)
}

Model 3’

Fit

form <- pheno3p ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA_ij" = allZ$SMA_mod3p_ij,
  "SMA_ii" = allZ$SMA_mod3p_ii
)
listVCov <- list(
  "GMA" = GRM,
  "SMA_ij" = diag(ncol(listZ$SMA_ij)),
  "SMA_ii" = diag(ncol(listZ$SMA_ii))
)
listContr <- list(block = "contr.sum")
fits3p <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
##    user  system elapsed 
##   1.031   1.658   0.708 
## [1] "fit with TMB..."
##    user  system elapsed 
##   1.674   1.797   1.420

Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits3p$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
  estim = tmp[c("GMA", "SMA_ij", "SMA_ii", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##            truth      estim        nBE
## var_GMA     10.0 10.0390280  0.3902804
## var_SMA_ij   1.5  1.6726172 11.5078114
## var_SMA_ii   0.8  0.9481630 18.5203751
## var_error    1.0  0.9440191 -5.5980891

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
  estim = do.call(c, fits3p$TMB$report[c("var_GMA", "var_SMA1", "var_SMA2", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##            truth      estim        nBE
## var_GMA     10.0 10.0390775  0.3907746
## var_SMA_ij   1.5  1.6726225 11.5081642
## var_SMA_ii   0.8  0.9481763 18.5220427
## var_error    1.0  0.9440195 -5.5980522

if ("MM4LMM" %in% names(fits3p)) {
  print(fits3p$MM4LMM$Sigma2)
}

Appendix

t1 <- proc.time()
t1 - t0
##    user  system elapsed 
##  37.260  34.380  34.265
print(sessionInfo(), locale = FALSE)
## R version 4.6.0 (2026-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plantmix_1.0.2 lme4_2.0-1     Matrix_1.7-5   ggplot2_4.0.3  knitr_1.51    
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     lattice_0.22-9     digest_0.6.39     
##  [5] magrittr_2.0.5     evaluate_1.0.5     grid_4.6.0         RColorBrewer_1.1-3
##  [9] fastmap_1.2.0      jsonlite_2.0.0     scales_1.4.0       jquerylib_0.1.4   
## [13] reformulas_0.4.4   Rdpack_2.6.6       cli_3.6.6          rlang_1.2.0       
## [17] rbibutils_2.4.1    splines_4.6.0      withr_3.0.2        cachem_1.1.0      
## [21] yaml_2.3.12        otel_0.2.0         tools_4.6.0        nloptr_2.2.1      
## [25] minqa_1.2.8        dplyr_1.2.1        boot_1.3-32        buildtools_1.0.0  
## [29] vctrs_0.7.3        R6_2.6.1           lifecycle_1.0.5    MASS_7.3-65       
## [33] pkgconfig_2.0.3    pillar_1.11.1      bslib_0.11.0       gtable_0.3.6      
## [37] glue_1.8.1         Rcpp_1.1.1-1.1     xfun_0.58          tibble_3.3.1      
## [41] tidyselect_1.2.1   sys_3.4.3          farver_2.1.2       htmltools_0.5.9   
## [45] nlme_3.1-169       igraph_2.3.2       rmarkdown_2.31     maketools_1.3.2   
## [49] TMB_1.9.21         compiler_4.6.0     S7_0.2.2

2) Example of GMA-SMA models for varietal mixtures in microplots

Preamble

Overview

Simulate data

Simulate the genotypes

Set the dimensions

SNP genotypes

Additive genetic relationships

Simulate the phenotypes

Incomplete, yet balanced design

Field trial

Design matrices

Parameters

Phenotypes

Infer the parameters

Model 1

Fit

Check

Model 2

Fit

Check

Model 3

Fit

Check

Model 2’

Fit

Check

Model 2’’

Fit

Check

Model 3’

Fit

Check

Appendix