Issues from Graphical Models and Bayesian Networks

Tutorial on July 3rd in Rennes,

Soren Hojsgaard

Issue 1


To plot a graph, the Rgraphviz package must be loaded and this must unfortunately be done manually:

library(gRain)
library(Rgraphviz) 
d <- dag(~ch|fa:mo + ch.pht|ch + mo.pht|mo + fa.pht|fa)
plot( d )

plot of chunk unnamed-chunk-1

Issue 2


Conditional independence in the multivariate normal distribution

K <- matrix(c(2,1,0,
              1,2,1,
              0,1,2), 3, 3)
Sigma <- solve(K)
Sigma
##       [,1] [,2]  [,3]
## [1,]  0.75 -0.5  0.25
## [2,] -0.50  1.0 -0.50
## [3,]  0.25 -0.5  0.75
d <- data.frame(MASS::mvrnorm(100, mu=c(0,0,0), Sigma=Sigma))

Marginally, all variables are correlated:

pairs(d)

plot of chunk unnamed-chunk-3

cor(d)
##            X1         X2         X3
## X1  1.0000000 -0.4754942  0.3073193
## X2 -0.4754942  1.0000000 -0.5469225
## X3  0.3073193 -0.5469225  1.0000000

Regress X1 on X2 and X3 on X2 and look at the residuals

r1.2 <- resid( lm( X1 ~ X2, data=d) )
r3.2 <- resid( lm( X3 ~ X2, data=d) )
plot(r1.2, r3.2)

plot of chunk unnamed-chunk-4

cor(r1.2, r3.2)
## [1] 0.06417072

The residuals are now uncorrelated so after adjusting for X2, any association between X1 and X3 has been removed. An alternative view is this: Regress X1 on X2 and X3:

coef( summary( lm( X1 ~ X2 + X3, data=d) ) )
##                Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)  0.02473166 0.07332566  0.3372852 7.366306e-01
## X2          -0.36729498 0.08916068 -4.1194726 7.994972e-05
## X3           0.05964660 0.09418179  0.6333136 5.280193e-01

Cleary X1 does not depend on X3 once we know X2.

Issue 3


Getting the phenotype into the model

library(gRain)
library(Rgraphviz)

What we did in the tutorial

inheritance <- mendel( c("y","g"), names = c("ch", "fa", "mo") )$prob
head( inheritance )
## [1] 1.0 0.0 0.0 0.5 0.5 0.0
gts <- c("yy", "yg", "gg")
gtprobs <- c(.25, .50, .25)
ssp <- list(fa = gts, mo = gts, ch = gts)
p.c_fm <- arMk(~ch:fa:mo, levels=ssp, values=inheritance)
p.fa <- arMk(~fa, levels=ssp, values=gtprobs)
p.mo <- arMk(~mo, levels=ssp, values=gtprobs)
ftable( p.c_fm, row.vars = "ch" )
##    fa   yy             yg             gg          
##    mo   yy   yg   gg   yy   yg   gg   yy   yg   gg
## ch                                                
## yy    1.00 0.50 0.00 0.50 0.25 0.00 0.00 0.00 0.00
## yg    0.00 0.50 1.00 0.50 0.50 0.50 1.00 0.50 0.00
## gg    0.00 0.00 0.00 0.00 0.25 0.50 0.00 0.50 1.00
cptl <- compileCPT( list(p.fa, p.mo, p.c_fm ) ); cptl
## CPTspec with probabilities:
##  P( fa )
##  P( mo )
##  P( ch | fa mo )
trio <- grain( cptl )
plot( trio )

plot of chunk unnamed-chunk-7

We extend network with phenotypes (the colours that can be observed):

ptprobs <- c(1,0,1,0,0,1)
pt <- c("Y", "G")
ssp <- list(fa = gts, mo = gts, ch = gts, ch.pt=pt, fa.pt=pt, mo.pt=pt)
p.cpt <- arMk(~ch.pt:ch, levels=ssp, values=ptprobs)
p.fpt <- arMk(~fa.pt:fa, levels=ssp, values=ptprobs)
p.mpt <- arMk(~mo.pt:mo, levels=ssp, values=ptprobs)

A logical array:

p.cpt
##      ch
## ch.pt yy yg gg
##     Y  1  1  0
##     G  0  0  1

Now we have

cptl2 <- compileCPT( list( p.cpt, p.fpt, p.mpt, p.fa, p.mo, p.c_fm ) ); cptl
## CPTspec with probabilities:
##  P( fa )
##  P( mo )
##  P( ch | fa mo )
trio2 <- grain( cptl2 )
plot( trio2 )

plot of chunk unnamed-chunk-10

Now we can condition on the phenotype

qgrain(trio2, nodes=c("ch","fa","mo"))
## $ch
## ch
##   yy   yg   gg 
## 0.25 0.50 0.25 
## 
## $fa
## fa
##   yy   yg   gg 
## 0.25 0.50 0.25 
## 
## $mo
## mo
##   yy   yg   gg 
## 0.25 0.50 0.25

When child is yellow, the genotype must be yy or yg

qgrain(trio2, evidence=list(ch.pt="Y"), nodes=c("ch","fa","mo"))
## $ch
## ch
##        yy        yg        gg 
## 0.3333333 0.6666667 0.0000000 
## 
## $fa
## fa
##        yy        yg        gg 
## 0.3333333 0.5000000 0.1666667 
## 
## $mo
## mo
##        yy        yg        gg 
## 0.3333333 0.5000000 0.1666667

We don't know which genotype it is, but yg is still twice as probable as yy - just as the case was before conditioning