I build from time some small functions and make them available.
## Lags and forwards
# Simplelag gets a variable value from a given line above. Forward gets the value from a given line below. It's inspired from the lag and forward function in SAS.
#mylag: number of lags
#myforward: number of forward
#by: enables to do a lag or of forward by a grouping variable
#outside: default value when lags or forwards are not available
simplelag <-function(x,by=NULL,mylag=1,outside=NA)
{
myend<-length(x)- mylag
if (!is.null(by)) {
lby<-c(replicate(mylag,""),as.character(by[1:myend]))
y0<-c(replicate(mylag,outside),x[1:myend])
y<-ifelse(as.character(by)==lby,y0,outside)
}
else {
y<-c(replicate(mylag,outside),x[1:myend])
}
}
forward<-function(x,by=NULL,myforward=1,outside=NA)
{
mystart<-myforward+1
if (!is.null(by)) {
fby<-c(as.character(by[mystart:length(x)]),replicate(myforward,""))
y0<-c(x[mystart:length(x)],replicate(myforward,outside))
y<-ifelse(as.character(by)==fby,y0,outside)
}
else {
y<-c(x[mystart:length(x)],replicate(myforward,outside))
}
}
#Example
#Value of d$x one row above
d$lag_x<-simplelag(d$x)
#Value of d$x three rows above, 0 in the two first rows
d$lag3_x<-simplelag(d$x,mylag=3, outside=0)
#Value of d$x two rows below
d$fwd2_x<-forward(d$x,myforward=2)
#Value of d$x one row above if in the same group, 0 otherwise
d$lag_x<-simplelag(d$x,by=d$group,outside=0)
## Retain and obtain
Retain is the equivalent of the retain instruction in SAS. Obtain is the opposite of Retain.
retain<-function(x,event,outside=NA)
{
indices <- c(1,which(event==TRUE), length(x)+1)
values <- c(outside,x[event %in% TRUE])
y<- rep(values, diff(indices))
}
obtain<-function(x,event,outside=NA)
{
indices <- c(0,which(event==TRUE), length(x))
values <- c(x[event %in% TRUE],outside)
y<- rep(values, diff(indices))
}
#Example
#Retain in the following rows the last x where event =1
d$last_x<-retain(d$x,d$event==1)
#Obtain in the first rows the first x where event =2
d$first_x<-obtain(d$x,d$event==2)
## Winsoring
winsor<-function(x,tmax=0.99,tmin=0.01)
{
y<-pmin(x,quantile(x,probs=tmax,na.rm=TRUE))
y<-pmax(y,quantile(y,probs=tmin,na.rm=TRUE))
}
## egen : Statistics by group
# DEVALUATED. I discovered one could do exactly the same things with the function ave which is in the R default package.
d$sd_x<-ave(d$x,d$category,FUN=sd)
# Egen produces statistics of a variable by group, such as mean, median, standard deviation, etc. It is the equivalent of Stata's egen function.
egen<-function (x,y,myfun=mean)
{
data_in<-data.frame(x,y)
data_out<-within(data_in,{z = ave(x,y,FUN=myfun,na.rm=TRUE)} )
return(data_out$z)
}
#Example
#Mean of x by category
d$mean_x<-egen(d$x,d$category)
#Standard deviation of x by category
d$sd_x<-egen(d$x,d$category,myfun=sd)
#Median of x by category
d$median_x<-egen(d$x,d$category,myfun=median)
## Triming
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
## Reversing a string
strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
## Clustering
# clx and mclx enable to robust-cluster standard errors in a regressions. Those functions are not mine. clx clusters according to one variable, mclx according to two variables.
clx <-
function(fm, dfcw, cluster){
library(sandwich)
library(lmtest)
M <- length(unique(cluster))
N <- length(cluster)
dfc <- (M/(M-1))*((N-1)/(N-fm$rank))
u <- apply(estfun(fm),2,
function(x) tapply(x, cluster, sum))
vcovCL <- dfc*sandwich(fm, meat=crossprod(u)/N)*dfcw
coeftest(fm, vcovCL) }
mclx <-
function(fm, dfcw, cluster1, cluster2){
library(sandwich)
library(lmtest)
cluster12 = paste(cluster1,cluster2, sep="")
M1 <- length(unique(cluster1))
M2 <- length(unique(cluster2))
M12 <- length(unique(cluster12))
N <- length(cluster1)
K <- fm$rank
dfc1 <- (M1/(M1-1))*((N-1)/(N-K))
dfc2 <- (M2/(M2-1))*((N-1)/(N-K))
dfc12 <- (M12/(M12-1))*((N-1)/(N-K))
u1 <- apply(estfun(fm), 2,
function(x) tapply(x, cluster1, sum))
u2 <- apply(estfun(fm), 2,
function(x) tapply(x, cluster2, sum))
u12 <- apply(estfun(fm), 2,
function(x) tapply(x, cluster12, sum))
vc1 <- dfc1*sandwich(fm, meat=crossprod(u1)/N )
vc2 <- dfc2*sandwich(fm, meat=crossprod(u2)/N )
vc12 <- dfc12*sandwich(fm, meat=crossprod(u12)/N)
vcovMCL <- (vc1 + vc2 - vc12)*dfcw
coeftest(fm, vcovMCL)}