Material suplementar

Walkthrough do código — Aula 3: Estimativa de estado ancestral e sinal filogenético

Walkthrough do código usado em sala. Os comandos não são executados pelo renderizador — copie e cole no seu R local. Você pode baixar o script .R original e os datasets em Aula 3.

decdiv <- function(phy, df, dis = NULL, tol = 1e-08){
    
    if(is.vector(df)){
        df <- cbind.data.frame(df)
    }
    if(!is.data.frame(df)) stop("df should be a data frame")
    if(any(apply(df, 2, sum)<tol)) stop("null column in df")
    if(any(df < -tol)) stop("negative values in df")
    df[df < tol] <- 0
    df <- as.data.frame(apply(df, 2, function(x) x/sum(x)))
    
    disc2 <- function(samples, dis = NULL, structures = NULL, tol = 1e-08) 
    {
        if (!inherits(samples, "data.frame")) 
            stop("Non convenient samples")
        if (any(samples < 0)) 
            stop("Negative value in samples")
        if (any(apply(samples, 2, sum) < 1e-16)) 
            stop("Empty samples")
        if (!is.null(dis)) {
            if (!inherits(dis, "dist")) 
                stop("Object of class 'dist' expected for distance")
            if (!is.euclid(dis)) 
                warning("Euclidean property is expected for distance")
            dis <- as.matrix(dis)
            if (nrow(samples) != nrow(dis)) 
                stop("Non convenient samples")
        }
        if (is.null(dis)) 
            dis <- (matrix(1, nrow(samples), nrow(samples)) - diag(rep(1, 
                nrow(samples)))) * sqrt(2)
        if (!is.null(structures)) {
            if (!inherits(structures, "data.frame")) 
                stop("Non convenient structures")
            m <- match(apply(structures, 2, function(x) length(x)), 
                ncol(samples), 0)
            if (length(m[m == 1]) != ncol(structures)) 
                stop("Non convenient structures")
            m <- match(tapply(1:ncol(structures), as.factor(1:ncol(structures)), 
                function(x) is.factor(structures[, x])), TRUE, 0)
            if (length(m[m == 1]) != ncol(structures)) 
                stop("Non convenient structures")
        }
        Structutil <- function(dp2, Np, unit) {
            if (!is.null(unit)) {
                modunit <- model.matrix(~-1 + unit)
                sumcol <- apply(Np, 2, sum)
                Ng <- modunit * sumcol
                lesnoms <- levels(unit)
            }
            else {
                Ng <- as.matrix(Np)
                lesnoms <- colnames(Np)
            }
            sumcol <- apply(Ng, 2, sum)
            Lg <- t(t(Ng)/sumcol)
            colnames(Lg) <- lesnoms
            Pg <- as.matrix(apply(Ng, 2, sum)/nbhaplotypes)
            rownames(Pg) <- lesnoms
            deltag <- as.matrix(apply(Lg, 2, function(x) t(x) %*% 
                dp2 %*% x))
            ug <- matrix(1, ncol(Lg), 1)
            dg2 <- t(Lg) %*% dp2 %*% Lg - 1/2 * (deltag %*% t(ug) + 
                ug %*% t(deltag))
            colnames(dg2) <- lesnoms
            rownames(dg2) <- lesnoms
            return(list(dg2 = dg2, Ng = Ng, Pg = Pg))
        }
        Diss <- function(dis, nbhaplotypes, samples, structures) {
            structutil <- list(0)
            structutil[[1]] <- Structutil(dp2 = dis, Np = samples, 
                NULL)

            ###
            diss <- list(as.dist(structutil[[1]]$dg2))
            fun1 <- function(x){
                y <- x
                y[y<tol] <- 0
                return(y)
            }
            diss <- lapply(diss, fun1)
            diss <- lapply(diss, function(x) sqrt(2*x))
            ###

            if (!is.null(structures)) {
                for (i in 1:length(structures)) {
                    structutil[[i + 1]] <- Structutil(structutil[[1]]$dg2, 
                    structutil[[1]]$Ng, structures[, i])
                }
                ###
                diss <- c(diss, tapply(1:length(structures), factor(1:length(structures)),
                    function(x) (as.dist(structutil[[x + 1]]$dg2))))
                diss <- lapply(diss, fun1)
                diss <- lapply(diss, function(x) sqrt(2*x))
                ###
            }
            return(diss)
        }
        nbhaplotypes <- sum(samples)
        diss <- Diss(dis^2, nbhaplotypes, samples, structures)
        names(diss) <- c("samples", names(structures))
        if (!is.null(structures)) {
            return(diss)
        }
        return(diss$samples)
    }
   
    decdivV <- function(freq){
       nsp <- length(phy$leaves)
       nnodes <- length(phy$nodes)
       matno <- as.data.frame(matrix(0, nnodes, nsp))
       rownames(matno) <- names(phy$nodes)
       names(matno) <- names(phy$leaves)
       for(i in 1:nsp){
          matno[phy$path[[i]][-length(phy$path[[i]])], i] <- 1
       }
       matfr <- as.matrix(matno) %*% diag(freq)
       matfr2 <- as.data.frame(t(matfr))
        divno <- divc(matfr2, dis)
        matfr3 <- cbind.data.frame(matfr2, diag(freq))
        names(matfr3) <- c(names(matfr2), names(phy$leaves))
        matfr4 <- matfr3[, apply(matfr3, 2, sum)!=0]
        if(ncol(matfr4)==0) stop("only one species considered")
        discno <- disc2(matfr4, dis, tol = tol)
        lambdano <- apply(matfr4, 2, sum)
        prdist <- diag(lambdano)%*%as.matrix(discno^2/2)%*%diag(lambdano)
        colnames(prdist) <- rownames(prdist) <- names(matfr4)
        fun1 <- function(x){
            x <- x[apply(matfr3[x], 2, sum)!=0]
            if(length(x) == 1) return(0)
            else return(sum(prdist[x, x])/2)
        }
        res <- unlist(lapply(phy$parts, fun1))
        lambdano <- apply(matfr3, 2, sum)
        lambdano[lambdano < tol] <- 1
        res <- res * 1/as.vector(lambdano)[1:nnodes]
       return(res)
    }
    return(apply(df, 2, decdivV))
}

plot.decdiv <- 
function (x, y = NULL, f.phylog = 0.5, cleaves = 1, cnodes = 0, vnodes = NULL, vcolor = "white", adjust = 1, vmax = max(vnodes),
    labels.leaves = names(x$leaves), clabel.leaves = 1, labels.nodes = names(x$nodes), 
    clabel.nodes = 0, sub = "", csub = 1.25, possub = "topleft", 
    draw.box = FALSE, clegend = 1, ...) 
{
    if (!inherits(x, "phylog")) 
        stop("Non convenient data")
    leaves.number <- length(x$leaves)
    leaves.names <- names(x$leaves)
    nodes.number <- length(x$nodes)
    nodes.names <- names(x$nodes)
    if (length(labels.leaves) != leaves.number) 
        labels.leaves <- names(x$leaves)
    if (length(labels.nodes) != nodes.number) 
        labels.nodes <- names(x$nodes)
    leaves.car <- gsub("[_]", " ", labels.leaves)
    nodes.car <- gsub("[_]", " ", labels.nodes)
    mar.old <- par("mar")
    on.exit(par(mar = mar.old))
    par(mar = c(0.1, 0.1, 0.1, 0.1))
    if (f.phylog < 0.05) 
        f.phylog <- 0.05
    if (f.phylog > 0.95) 
        f.phylog <- 0.95
    maxx <- max(x$droot)
    plot.default(0, 0, type = "n", xlab = "", ylab = "", xaxt = "n", 
        yaxt = "n", xlim = c(-maxx * 0.15, maxx/f.phylog), ylim = c(-0.05, 
            1), xaxs = "i", yaxs = "i", frame.plot = FALSE)
    x.leaves <- x$droot[leaves.names]
    x.nodes <- x$droot[nodes.names]

    reglage <- maxx * 0.10 * adjust
    cex0 <- par("cex") * clegend
    ###
        legender <- function(br0, sq0, clegend) {
        sq0 <- round(sq0, dig = 3)
        cha <- as.character(sq0[1])
        for (i in (2:(length(sq0)))) cha <- paste(cha, sq0[i], 
            sep = " ")
        yh <- max(c(strheight(cha, cex = cex0), br0))
        h <- strheight(cha, cex = cex0)
        y0 <- par("usr")[3] + yh / (par("usr")[2] - par("usr")[1]) * (par("usr")[4] - par("usr")[3])
        x0 <- par("usr")[1] + h/2
        for (i in (1:(length(br0)))) {
            cha <- sq0[i]
            cha <- paste(" ", cha, sep = "")
            xh <- strwidth(cha, cex = cex0)
            text(x0 + xh/2, y0, cha, cex = cex0)
            z0 <- br0[i]
            x0 <- x0 + xh + z0
            symbols(x0, y0, circles = z0, bg = vcolor, 
                    fg = "black", add = TRUE, inch = FALSE)
            x0 <- x0 + z0
        }
        invisible()
    }
    if (clegend > 0) {
            sq0 <- pretty(c(vnodes, vmax), 4)
            l0 <- length(sq0)
            sq0 <- (sq0[1:(l0 - 1)] + sq0[2:l0])/2
            br0 <- (sq0 / vmax * reglage)
            legender(br0, sq0, clegend = clegend)
    }
    ###
    
    
    ###
    if(clegend > 0){
        cha <- as.character(sq0[1])
        yh <- max(c(strheight(cha, cex = cex0), br0*2))
        decalage <- yh / (par("usr")[2]-par("usr")[1]) * (par("usr")[4]-par("usr")[3]) - 0.05
    }
    else decalage <- 0
    ypos <- seq(decalage, 1, length = leaves.number + 2)[(leaves.number:1)+1]
    if(!is.null(y)) ypos <- ypos[y] 
    ###
    
    names(ypos) <- leaves.names
    xcar <- maxx * 1.05
    xx <- c(x.leaves, x.nodes)
    if (clabel.leaves > 0) {
        for (i in 1:leaves.number) {
            text(xcar, ypos[i], leaves.car[i], adj = 0, cex = par("cex") * 
                clabel.leaves)
            segments(xcar, ypos[i], xx[i], ypos[i], col = grey(0.7))
        }
    }
    yleaves <- ypos[1:leaves.number]
    xleaves <- xx[1:leaves.number]

    if (cleaves > 0) {
        for (i in 1:leaves.number) {
            points(xx[i], ypos[i], pch = 21, bg = 1, cex = par("cex") * 
                cleaves)
        }
    }
    yn <- rep(0, nodes.number)
    names(yn) <- nodes.names
    y <- c(ypos, yn)
    for (i in 1:length(x$parts)) {
        w <- x$parts[[i]]
        but <- names(x$parts)[i]
        ypos[but] <- mean(ypos[w])
        b <- range(ypos[w])
        segments(xx[but], b[1], xx[but], b[2])
        x1 <- xx[w]
        y1 <- ypos[w]
        x2 <- rep(xx[but], length(w))
        segments(x1, y1, x2, y1)
    }
    if (cnodes > 0 | !is.null(vnodes)) {
        sq <- vnodes /vmax
        names(sq) <- nodes.names
        for (i in nodes.names) {
            if(length(vnodes)!= length(x$nodes))
                points(xx[i], ypos[i], pch = 21, bg = "white", cex = cnodes)
            else{
                ###
                radi <- sq[i] * reglage
                if(sq[i] > 0)
                symbols(xx[i], ypos[i], circles = radi, 
                   bg = vcolor, fg = "black", add = TRUE, inch = FALSE)
                ###
                }
        }
    }

    if (clabel.nodes > 0) {
        scatterutil.eti(xx[names(x.nodes)], ypos[names(x.nodes)], 
            labels.nodes, clabel.nodes)
    }
    x <- (x.leaves - par("usr")[1])/(par("usr")[2] - par("usr")[1])
    ypos <- ypos[leaves.names]
    xbase <- (xcar - par("usr")[1])/(par("usr")[2] - par("usr")[1])
    if (csub > 0) 
        scatterutil.sub(sub, csub = csub, possub = possub)
    if (draw.box) 
        box()
    if (cleaves > 0) 
        points(xleaves, yleaves, pch = 21, bg = 1, cex = par("cex") * 
            cleaves)

    return(invisible(list(xy = data.frame(x = x, y = ypos), xbase = xbase, 
        cleaves = cleaves)))
}

rtest.decdiv <- function(phy, freq, dis = NULL, nrep = 99, vranking = "complexity", ties.method = "average", option = 1:3, optiontest = NULL, tol = 1e-08)
{

    #*******************************************************************************#
    #                         Checking of the parameters                            #
    #*******************************************************************************#
    
    if(!is.vector(freq)) stop("freq must be a unique vector")
    if (!is.numeric(nrep) | nrep <= 1) 
        stop("Non convenient nrep")
    if(sum(freq) < tol) stop("empty sample")
    if(any(freq < -tol)) stop("negative values in df")
    
    #*******************************************************************************#
    #                               Basic notations                                 #
    #*******************************************************************************#
    
    freq[freq < tol] <- 0
    freq <- freq / sum(freq)
    
    nsp <- length(phy$leaves)
    nnodes <- length(phy$nodes)
    if(is.null(dis))
    dis <- as.dist(sqrt(2*matrix(1, nsp, nsp) - diag(rep(1, nsp))))
    
    #*******************************************************************************#
    #                               Node ranking                                    #
    #*******************************************************************************#

    complexity <- function(phy){   

        matno <- as.data.frame(matrix(0, nnodes, nnodes))
        rownames(matno) <- names(phy$nodes)
        names(matno) <- names(phy$nodes)
        pathnodes <- phy$path[-(1:nsp)]
        for(i in 1:nnodes){
            matno[pathnodes[[i]], i] <- 1
        }
        listno <- lapply(1:nnodes, function(i) names(matno)[matno[i, ] > 0])
        names(listno) <- names(phy$nodes)
        nbdes <- cbind.data.frame(lapply(phy$parts, function(x) prod(1:length(x))))
        compl <- lapply(listno, function(x) prod(nbdes[x]))
        compltab <- cbind.data.frame(compl)
        compltab <- cbind.data.frame(t(compltab))
        names(compltab) <- "complexity"
        return(compltab)
        
    }

    droot <- function(phy){
        roottab <- cbind.data.frame(phy$droot[-(1:nsp)])
        names(roottab) <- "droot"
        return(roottab)
    }

    if(is.numeric(vranking)){
        vrank <- as.data.frame(rank(vranking, ties.method = ties.method))
        names(vrank) <- "free"
    }
    else
    vrank <- sapply(vranking, function(x) rank(get(x)(phy), ties.method = ties.method))
   
    if(!any(option == 3))
        r1 <- length(option)
    else
        r1 <- length(option) + length(vranking) - 1

    #*******************************************************************************#
    #                       Field observations                                      #
    #*******************************************************************************#
    
    vobs <- decdiv(phy, freq, dis, tol = 1e-08)

    #*******************************************************************************#
    #                       Statistics for the four tests                           #
    #*******************************************************************************#    
    
    namnodes <- rownames(vobs)
    stat1 <- function(v){
        v <- v/sum(v)
        return(max(v))
    }
    stat2 <- function(v){
        v <- v/sum(v)
        fun1 <- function(m){
            return(abs(sum(sort(v)[1:m]) - m/nnodes))
        }
        return(max(unlist(lapply(1:nnodes, fun1))))
    }
    stat3 <- function(v){
        # this statistics has been sightly changed because the consideration of ties in Ollier et al.
        # was not explained, althought ties always happen with such a methodology. 
        funstat3 <- function(vrank1){
            v <- v/sum(v)
            return(sum(rank(vrank1, ties.method = ties.method)*v)/nnodes)
        }
        return(apply(vrank, 2, funstat3))
    }

    methods <- c("stat1", "stat2", "stat3")[option]
    
    #*******************************************************************************#
    #                      Statistics on field observations                         #
    #*******************************************************************************#
    
    statobs <- unlist(sapply(methods, function(x) get(x)(vobs[, 1])))

    #*******************************************************************************#
    #                              Permutation scheme                               #
    #*******************************************************************************#     
    
    funperm <- function(i){
        e <- sample(1:nsp)
        vtheo <- decdiv(phy, freq[e], as.dist(as.matrix(dis)[e, e]), tol = tol)
        stattheo <- unlist(sapply(methods, function(x) get(x)(vtheo[, 1])))
        return(stattheo)
    }
    
    tabsimu <- as.data.frame(t(cbind.data.frame(lapply(1:nrep, funperm))))
    rownames(tabsimu) <- paste("t", 1:nrep, sep="")
    if(r1 == 2 & methods[1] == "stat3")
        names(tabsimu) <- paste("stat3", names(tabsimu), sep=".")
    
    #*******************************************************************************#
    #                                     End                                       #
    #*******************************************************************************# 
    
    optiondefault <- c("greater", "greater", "two-sided", "two-sided", "two-sided")
    names(optiondefault) <- c("stat1", "stat2", "stat3.complexity", "stat3.droot", "stat3.free")
    
    if(r1 == 1)
    {
    if(!is.null(optiontest))
    return(as.randtest(obs = statobs, sim = tabsimu[, 1], alter = optiontest, call = "rtest.decdiv"))
    else
    return(as.randtest(tabsimu[, 1], statobs, alter = optiondefault[names(tabsimu)], call = "rtest.decdiv"))
    }
    
    if(!is.null(optiontest))
    return(as.krandtest(obs = statobs, sim = tabsimu, alter = optiontest, call = "rtest.decdiv"))
    else
    return(as.krandtest(obs = statobs, sim = tabsimu, alter = optiondefault[names(tabsimu)], 
        call = "rtest.decdiv"))

}