R is the
most comprehensive public-domain statistical software available today.
An implementation of the S language for statistical analysis, it is
increasingly popular in the statistical community. It takes only a few minutes
to obtain: go to
http://www.R-project.org , choose a CRAN mirror site, and download the
executable appropriate for your operating system.
The following demonstration is a series of excerpts from a weeklong set of tutorials
presented at the Summer School in Statistics for Astronomers and Physicists
at Penn State University (http://astrostatistics.psu.edu/su06/index.html).
When this demo is
viewed online, the bolded, indented lines
# like this one
are meant to be copied and pasted directly into R at the command prompt.
Quick R demo
Load the Hipparcos dataset described at
http://astrostatistics.psu.edu/datasets/HIP_star.html.
hip <- read.table("http://astrostatistics.psu.edu/datasets/HIP_star.dat",
header=T,fill=T)
attach(hip)
Create a fancy boxplot.
boxplot(Vmag~cut(B.V,breaks=(-1:6)/2),
notch=T, varwidth=T, las=1, tcl=.5,
xlab=expression("B minus V"),
ylab=expression("V magnitude"),
main="Can you find the red giants?",
cex=1, cex.lab=1.4, cex.axis=.8, cex.main=1)
axis(2, labels=F, at=0:12, tcl=-.25)
axis(4, at=3*(0:4))
Now for a scatterplot.
plot(RA,DE,pch=".")
Let's find the Hyades.
filter1 <- (RA>50 & RA<100
& DE>0 & DE<25)
plot(pmRA[filter1],pmDE[filter1],pch=20)
rect(0,-150,200,50,border=2)
plot(pmRA[filter1],pmDE[filter1],pch=20,
xlim=c(0,200),ylim=c(-150,50))
rect(90,-60,130,-10,border=2)
filter2 <- (pmRA>90 & pmRA<130
& pmDE>-60 & pmDE< -10) # Space in 'pmDE< -10' is necessary!
filter <- filter1 & filter2
Let's have a final look.
pairs(hip[filter,-c(1,5)],pch=20)
There is one outlying star in the e_Plx variable. Get rid of it:
filter <- filter & (e_Plx<5)
sum(filter)
A final look at the HR plot, with the Hyades highlighted:
plot(Vmag,B.V,pch=c(46,20)[1+filter],
col=1+filter,
xlim=range(Vmag[filter]), ylim=range(B.V[filter]))
Next, a regression example involving the dataset
http://astrostatistics.psu.edu/datasets/GRB_afterglow.html:
grb <- read.table
("http://astrostatistics.psu.edu/datasets/GRB_afterglow.dat",
header=T, skip=1)
x <- log10(grb[,1])
y <- log10(grb[,2])
plot(x,y,xlab="log time",ylab="log flux")
The relationship looks roughly linear, so let's try a
linear model:
model1 <- lm(y~x)
abline(model1, col=2, lwd=2)
model1 # same as print(model1)
summary(model1)
We can look at a confidence ellipse for the slope and intercept using the
"car" package, available from CRAN
(http://cran.r-project.org/mirrors.html).
library(car)
confidence.ellipse(model1)
Next up, a quadratic fit (which is still linear regression, by the way):
model2 <- lm(y~x+I(x^2))
summary(model2)
plot(x,y,xlab="log time",ylab="log flux")
abline(model1, col=2, lwd=2)
xx <- seq(min(x),max(x),len=200)
yy <- model2$coef %*%
rbind(1,xx,xx^2)
lines(xx,yy,lwd=2,col=3)
Let's check AIC for the two models:
AIC(model1)
AIC(model2)
And here is the BIC:
n <- length(x)
AIC(model1, k=log(n))
AIC(model2, k=log(n))
Let's try a nonparametric
lowess fit.
npmodel1 <- lowess(y~x)
lines(npmodel1, col=4, lwd=2)
It is hard to see the pattern of the lowess curve in the plot. Let's replot it with
no other distractions.
plot(x,y,xlab="log time",ylab="log flux",
type="n")
lines(npmodel1, col=4, lwd=2)
What about nonlinear fit?
flux <- grb[,2]
nlsmodel1 <- nls(flux ~ I(10^(a+b*x)),
start=list(a=0,b=0))
nlsmodel1
npmodel2 <- lowess(flux~x)
plot(x, flux, xlab="log time",
ylab="flux")
lines(xx, 10^(4.1085-.9674*xx), col=2, lwd=2)
lines(npmodel2, col=3, lwd=2)
Finally, resistant regression using the
lqs
function in the MASS package.
library(MASS)
lqsmodel1 <- lqs(y~x, method="lts")
plot(x,y,xlab="log time",ylab="log flux")
abline(model1,col=2)
abline(lqsmodel1,col=3)
Here is a model-based clustering example applied to the
COMBO-17 dataset
(http://astrostatistics.psu.edu/datasets/COMBO17.html).
combo <- read.csv(
"http://astrostatistics.psu.edu/datasets/COMBO17.csv",
na.strings="")
attach(combo)
xy <- cbind(BjMAG, S280MAG-BjMAG)
xy <- xy[Mcz>.7 &
Mcz<.9,]
xy <- na.omit(xy)
plot(xy[,1],xy[,2], pch=20,
xlab=expression(M[B]),
ylab=
expression("(280-B)"[rest]),
main="z between 0.7 and 0.9",
cex.lab=1.5)
The mclust package will do the clustering.
library(mclust)
mc2 <- Mclust(xy, minG=2, maxG=2)
r <- sqrt(qchisq(.5,2))
for(i in 1:2)
ellipse(mc2$mu[,i], mc2$sigma[,,i], r, col=1+i)
Is two components the "best" solution?
mc <- Mclust(xy)
mc$G
plot(xy[,1],xy[,2], pch=20,
xlab=expression(M[B]),
ylab=
expression("(280-B)"[rest]),
main="z between 0.7 and 0.9",
cex.lab=1.5)
for(i in 1:mc$G)
ellipse(mc$mu[,i], mc$sigma[,,i], r, col=1+i)