Quick R demo for astrostatistics

R is the most comprehensive public-domain statistical software available today. An implementation of the S language for statistical analysis, it is increasingly popular in the statistical community. It takes only a few minutes to obtain: go to http://www.R-project.org , choose a CRAN mirror site, and download the executable appropriate for your operating system.

The following demonstration is a series of excerpts from a weeklong set of tutorials presented at the Summer School in Statistics for Astronomers and Physicists at Penn State University (http://astrostatistics.psu.edu/su06/index.html). When this demo is viewed online, the bolded, indented lines

# like this one

are meant to be copied and pasted directly into R at the command prompt.

Quick R demo

Load the Hipparcos dataset described at http://astrostatistics.psu.edu/datasets/HIP_star.html.

   hip <- read.table("http://astrostatistics.psu.edu/datasets/HIP_star.dat",
      header=T,fill=T)
   attach(hip)

Create a fancy boxplot.

   boxplot(Vmag~cut(B.V,breaks=(-1:6)/2),
      notch=T, varwidth=T, las=1, tcl=.5,
      xlab=expression("B minus V"),
      ylab=expression("V magnitude"),
      main="Can you find the red giants?",
      cex=1, cex.lab=1.4, cex.axis=.8, cex.main=1)
   axis(2, labels=F, at=0:12, tcl=-.25)
   axis(4, at=3*(0:4))

Now for a scatterplot.

   plot(RA,DE,pch=".")

Let's find the Hyades.

   filter1 <- (RA>50 & RA<100 & DE>0 & DE<25)
   plot(pmRA[filter1],pmDE[filter1],pch=20)
   rect(0,-150,200,50,border=2)

   plot(pmRA[filter1],pmDE[filter1],pch=20,
            xlim=c(0,200),ylim=c(-150,50))
   rect(90,-60,130,-10,border=2)
   filter2 <- (pmRA>90 & pmRA<130 & pmDE>-60 & pmDE< -10) # Space in 'pmDE< -10' is necessary!
   filter <- filter1 & filter2

Let's have a final look.

   pairs(hip[filter,-c(1,5)],pch=20)

There is one outlying star in the e_Plx variable. Get rid of it:

   filter <- filter & (e_Plx<5)
   sum(filter)

A final look at the HR plot, with the Hyades highlighted:

   plot(Vmag,B.V,pch=c(46,20)[1+filter], col=1+filter,
        xlim=range(Vmag[filter]), ylim=range(B.V[filter]))

Next, a regression example involving the dataset http://astrostatistics.psu.edu/datasets/GRB_afterglow.html:

   grb <- read.table ("http://astrostatistics.psu.edu/datasets/GRB_afterglow.dat",
       header=T, skip=1)
   x <- log10(grb[,1])
   y <- log10(grb[,2])
   plot(x,y,xlab="log time",ylab="log flux")

The relationship looks roughly linear, so let's try a linear model:

   model1 <- lm(y~x)
   abline(model1, col=2, lwd=2)
   model1 # same as print(model1)
   summary(model1)

We can look at a confidence ellipse for the slope and intercept using the "car" package, available from CRAN (http://cran.r-project.org/mirrors.html).

   library(car)
   confidence.ellipse(model1)

Next up, a quadratic fit (which is still linear regression, by the way):

   model2 <- lm(y~x+I(x^2))
   summary(model2)
   plot(x,y,xlab="log time",ylab="log flux")
   abline(model1, col=2, lwd=2)
   xx <- seq(min(x),max(x),len=200)
   yy <- model2$coef %*% rbind(1,xx,xx^2)
   lines(xx,yy,lwd=2,col=3)

Let's check AIC for the two models:

   AIC(model1)
   AIC(model2)

And here is the BIC:

   n <- length(x)
   AIC(model1, k=log(n))
   AIC(model2, k=log(n))

Let's try a nonparametric lowess fit.

   npmodel1 <- lowess(y~x)
   lines(npmodel1, col=4, lwd=2)

It is hard to see the pattern of the lowess curve in the plot. Let's replot it with no other distractions.

   plot(x,y,xlab="log time",ylab="log flux", type="n")
   lines(npmodel1, col=4, lwd=2)

What about nonlinear fit?

   flux <- grb[,2]
   nlsmodel1 <- nls(flux ~ I(10^(a+b*x)), start=list(a=0,b=0))
   nlsmodel1
   npmodel2 <- lowess(flux~x)
   plot(x, flux, xlab="log time", ylab="flux")
   lines(xx, 10^(4.1085-.9674*xx), col=2, lwd=2)
   lines(npmodel2, col=3, lwd=2)

Finally, resistant regression using the lqs function in the MASS package.

   library(MASS)
   lqsmodel1 <- lqs(y~x, method="lts")
   plot(x,y,xlab="log time",ylab="log flux")
   abline(model1,col=2)
   abline(lqsmodel1,col=3)

Here is a model-based clustering example applied to the COMBO-17 dataset (http://astrostatistics.psu.edu/datasets/COMBO17.html).

   combo <- read.csv(
        "http://astrostatistics.psu.edu/datasets/COMBO17.csv",
        na.strings="")
   attach(combo)
   xy <- cbind(BjMAG, S280MAG-BjMAG)
   xy <- xy[Mcz>.7 & Mcz<.9,]
   xy <- na.omit(xy)
   plot(xy[,1],xy[,2], pch=20,
     xlab=expression(M[B]),
     ylab= expression("(280-B)"[rest]),
     main="z between 0.7 and 0.9",
     cex.lab=1.5)

The mclust package will do the clustering.

   library(mclust)
   mc2 <- Mclust(xy, minG=2, maxG=2)
   r <- sqrt(qchisq(.5,2))
   for(i in 1:2)
        ellipse(mc2$mu[,i], mc2$sigma[,,i], r, col=1+i)

Is two components the "best" solution?

   mc <- Mclust(xy)
   mc$G
   plot(xy[,1],xy[,2], pch=20,
     xlab=expression(M[B]),
     ylab= expression("(280-B)"[rest]),
     main="z between 0.7 and 0.9",
     cex.lab=1.5)
   for(i in 1:mc$G)
        ellipse(mc$mu[,i], mc$sigma[,,i], r, col=1+i)