Plotting Input Data

To visualize the structure of the input interaction matrix we can use two built-in plotting functions.

plot_Z(com, tickMarks=20, cex.lab=1, cex.axis=1)

We can also use the function lof to left order the interaction matrix before plotting:

plot_Z(lof(com), tickMarks=20, cex.lab=1, cex.axis=1)

Another interesting property of networks to explore is the degree distribution. We can look at the degree distribution for both hosts and parasites with the following function:

plot_degree(com, cex.lab=1, cex.axis=1, pt.cex=1, legend.cex=1)

Running the model

Running for 1500 slices (iterations) on an i7-6560U and 16GB RAM took about 25 seconds.

out <- network_est(Z = com, tree = tree, slices = 1000, model.type = 'full')
#> [1] "Running full model..."
#> [1] "Run for 1000 slices with 500 burn-ins"
#> [1] "Matrix dimension: 70 x 442"
#> [1] "slice: 200, at 2020-04-28 21:43:02"
#> [1] "slice: 400, at 2020-04-28 21:43:08"
#> [1] "slice: 600, at 2020-04-28 21:43:15"
#> [1] "slice: 800, at 2020-04-28 21:43:21"
#> [1] "slice: 1000, at 2020-04-28 21:43:28"
#> [1] "Done!"
str(out)
#> List of 3
#>  $ param:List of 6
#>   ..$ w      : num [1:442, 1:500] 1.259 1.248 0.621 1.097 1.335 ...
#>   ..$ y      : num [1:70, 1:500] 0.441 11.398 1.097 3.013 6.011 ...
#>   ..$ eta    : num [1:500] 8.11 7.4 8.54 8.54 7.98 ...
#>   ..$ g      : NULL
#>   ..$ burn.in: num 500
#>   ..$ sd     :List of 3
#>   .. ..$ w  : num [1:442] 1.96 2 2.14 1.98 2.05 ...
#>   .. ..$ y  : num [1:70] 1.35 6.95 2.27 3.27 4.99 ...
#>   .. ..$ eta: num 0.808
#>  $ Z    : num [1:70, 1:442] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : chr [1:70] "Addax_nasomaculatus" "Oryx_gazella" "Hippotragus_equinus" "Hippotragus_niger" ...
#>   .. ..$ : chr [1:442] "Acinetobacter lwoffii" "Actinobacillus actinomycetemcomitans" "Africanostrongylus buceros" "Agriostomum cursoni" ...
#>  $ tree :List of 4
#>   ..$ edge       : int [1:118, 1:2] 71 72 73 74 75 76 76 75 77 77 ...
#>   ..$ edge.length: num [1:118] 0.1774 0.0604 0.0792 0.3736 0.0604 ...
#>   ..$ Nnode      : int 49
#>   ..$ tip.label  : chr [1:70] "Addax_nasomaculatus" "Oryx_gazella" "Hippotragus_equinus" "Hippotragus_niger" ...
#>   ..- attr(*, "class")= chr "phylo"
#>   ..- attr(*, "order")= chr "cladewise"

Example traceplots

# Affinity parameter of parasite 1
plot(out$param$w[1,],type="l", col=2, ylab="Affinity parameter for parasite 1", xlab="Iteration")


# Affinity parameter of host 1
plot(out$param$y[1,],type="l", col=4, ylab="Affinity parameter for host 1", xlab="Iteration")


# Phylogeny scaling parameter
plot(out$param$eta, type="l",col="darkgreen", ylab="Phylogeny scaling parameter (EB model)", xlab="Iteration")

Generating posterior probability matrix

P <- sample_parameter(param=out$param, MODEL="full", Z=out$Z, tree=out$tree, size = 500)

General approach to running the model with 5-fold cross validation

## General variables
MODEL = 'full'                       # full, distance or affinity
SLICE = 1000                          # no of iterations
NO.CORES = 3                         # maximum cores to use
COUNT = TRUE                         # TRUE = count data, FALSE = year of first pub.
ALPHA.ROWS = 0.3                     # hyperparameter for prior over rows affinity, effective under affinity and full models only
ALPHA.COLS = 0.3                     # hyperparameter for prior over columns affinity, effective under affinity and full models only

## Loading required packages
require(parallel)

## preparing tree and com
cleaned <- network_clean(com, tree, 'full')
com <- cleaned$Z                         # cleaned binary interaction matrix
tree <- cleaned$tree                     # cleaned tree

## indexing 5-folds of interactions
folds <- cross.validate.fold(com, n= 5, min.per.col=2)  
[1] "Actual cross-validation rate is 0.095"
[2] "Actual cross-validation rate is 0.095"
[3] "Actual cross-validation rate is 0.095"
[4] "Actual cross-validation rate is 0.095"
[5] "Actual cross-validation rate is 0.096"

# returns a matrix of 3 columns (row, col, group), (row, col) correspond to Z, group to the CV group
tot.gr <- length(unique(folds[,'gr']))   # total number of CV groups

## A loop to run over all CV groups
res <- mclapply(1:tot.gr, function(x, folds, Z, tree, slice, model.type, ALPHA.ROWS, ALPHA.COLS){

    ## Analysis for a single fold
    Z.train = Z
    Z.train[folds[which(folds[,'gr']==x),c('row', 'col')]]<-0

    ## running the model of interest
    obj = network_est(Z.train, slices=slice, tree=tree, model.type=model.type,
                      a_y = ALPHA.ROWS, a_w = ALPHA.COLS)

    P = sample_parameter(obj$param, model.type, Z.train, tree)
    Eta = if(is.null(obj$param$eta)) 0 else mean(obj$param$eta)
    
    ## order the rows in Z.test as in Z.train
    roc = rocCurves(Z, Z.train, P, plot=FALSE, bins=400, all=FALSE)
    tb  = ana.table(Z, Z.train, P, roc,  plot=FALSE)
    roc.all = rocCurves(Z, Z.train, P=P, plot=FALSE, bins=400, all=TRUE)
    tb.all  = ana.table(Z, Z.train, P, roc.all, plot=FALSE)
    
    list(param=list(P=P, Eta = Eta), tb = tb,
         tb.all = tb.all, FPR.all = roc.all$roc$FPR,
         TPR.all=roc.all$roc$TPR, FPR = roc$roc$FPR, TPR=roc$roc$TPR)
    
},  
    folds=folds, Z = com, tree=tree, model.type=MODEL, slice = SLICE,
    ALPHA.ROWS = ALPHA.ROWS, ALPHA.COLS= ALPHA.COLS, 
    mc.preschedule = TRUE, mc.cores = min(tot.gr, NO.CORES))
[1] "Running full model..."
[1] "Running full model..."
[1][1] "Running full model..."
 "Run for 1000 slices with 500 burn-ins"
[1] "Matrix dimension: 70 x 442"
[1] "Run for 1000 slices with 500 burn-ins"
[1] "Matrix dimension: 70 x 442"
[1] "Run for 1000 slices with 500 burn-ins"
[1] "Matrix dimension: 70 x 442"
[1] "slice: 200, at 2020-04-18 18:08:37"
[1] "slice: 200, at 2020-04-18 18:08:37"
[1] "slice: 200, at 2020-04-18 18:08:37"
[1] "slice: 400, at 2020-04-18 18:08:41"
[1] "slice: 400, at 2020-04-18 18:08:41"
[1] "slice: 400, at 2020-04-18 18:08:41"
[1] "slice: 600, at 2020-04-18 18:08:45"
[1] "slice: 600, at 2020-04-18 18:08:45"
[1] "slice: 600, at 2020-04-18 18:08:45"
[1] "slice: 800, at 2020-04-18 18:08:51"
[1] "slice: 800, at 2020-04-18 18:08:51"
[1] "slice: 800, at 2020-04-18 18:08:51"
[1] "slice: 1000, at 2020-04-18 18:08:57"
[1] "Done!"
[1] "slice: 1000, at 2020-04-18 18:08:57"
[1] "Done!"
[1] "slice: 1000, at 2020-04-18 18:08:58"
[1] "Done!"
[1] "Running full model..."
[1] "Run for 1000 slices with 500 burn-ins"
[1] "Matrix dimension: 70 x 442"
[1] "Running full model..."
[1] "Run for 1000 slices with 500 burn-ins"
[1] "Matrix dimension: 70 x 442"
[1] "slice: 200, at 2020-04-18 18:09:07"
[1] "slice: 200, at 2020-04-18 18:09:07"
[1] "slice: 400, at 2020-04-18 18:09:12"
[1] "slice: 400, at 2020-04-18 18:09:12"
[1] "slice: 600, at 2020-04-18 18:09:18"
[1] "slice: 600, at 2020-04-18 18:09:19"
[1] "slice: 800, at 2020-04-18 18:09:25"
[1] "slice: 800, at 2020-04-18 18:09:26"
[1] "slice: 1000, at 2020-04-18 18:09:32"
[1] "Done!"
[1] "slice: 1000, at 2020-04-18 18:09:32"
[1] "Done!"

We can analyze the performance of the model via the area under the receiver operating characteristic curve (AUC), and the proportion of 1s in the original data successfully recovered.

#> Loading required package: parallel
#>    m.auc m.pred.held.out.ones   m.thresh m.hold.out
#> 1 0.9249             87.17949 0.04260652        117
#> 2 0.9211             88.03419 0.05263158        117
#> 3 0.9380             90.59829 0.04511278        117
#> 4 0.9258             84.61538 0.05764411        117
#> 5 0.9445             92.37288 0.04761905        118
#> [1] "Model: full, AUC: 0.930860 and percent 1 recovered from held out: 88.560046"

We can also construct the posterior probability matrix ‘P’ as the average across each fold, and look at the top undocumented interactions.

We can also compare the input matrix to the posterior interaction matrix, and the orginal phylogeny compared to the phylogeny with estimated EB scaling.

par(mfrow=c(1,2))

## printing input Z
plot_Z(com, tickMarks=20)

## printing posterior interaction matrix
plot_Z(1*(P > mean(sapply(res, function(r) r$tb$thres))), tickMarks=20)


## printing input tree
plot(tree, show.tip.label=FALSE)

## printing output tree
if(grepl('(full|dist)', MODEL)){
    Eta = mean(sapply(res, function(r) r$param$Eta))
    print(paste('Eta is', Eta))
    plot(rescale(tree, 'EB', Eta), show.tip.label=FALSE)
}
#> [1] "Eta is 6.83615118463669"

References

Elmasri, M., Farrell, M. J., Davies, T. J., & Stephens, D. A. (2020). A hierarchical Bayesian model for predicting ecological interactions using scaled evolutionary relationships. Annals of Applied Statistics, 14(1), 221-240.