Some useful functions for R

I build from time to time some small functions and make them available.

## Improved summary function, with weights, SD, SE, nb obs & quantiles

wtd.summary <- function (x,w=NULL,q=F) {
  ss <- NULL
  name <- paste(deparse(substitute(x)),collapse=" ")
  ss <- data.frame(name)
  ss$nb_obs <- length(x)
  if(!is.null(w)){
    ss$weight <- paste(deparse(substitute(w)),collapse=" ")
    ss$sum_wgt <- sum(w[is.na(x) ==F])
    ss$nb_missing <- sum(is.na(x))
    ss$sum_wgt_missing <- sum(w[is.na(x) ==T])
    ss$wtd_mean <- Hmisc::wtd.mean(x,w=w,na.rm=T)
    ss$wtd_sd <- Hmisc::wtd.var(x,w=w,na.rm=T)**0.5
    ss$ESS <- ss$sum_wgt^2/sum(w[is.na(x) ==F]^2)
    ss$wtd_se <- diagis::weighted_se(x,w=w,na.rm=T)
    ss$min <- min(x,na.rm=T)
    if(q==T){
      ss$p01 <- Hmisc::wtd.quantile(x,weights=w,probs=0.01,na.rm=T)
      ss$p05 <- Hmisc::wtd.quantile(x,weights=w,probs=0.05,na.rm=T)
      ss$p10 <- Hmisc::wtd.quantile(x,weights=w,probs=0.10,na.rm=T)
      ss$q1 <- Hmisc::wtd.quantile(x,weights=w,probs=0.25,na.rm=T)
      ss$q2 <- Hmisc::wtd.quantile(x,weights=w,probs=0.50,na.rm=T)
      ss$q3 <- Hmisc::wtd.quantile(x,weights=w,probs=0.75,na.rm=T)
      ss$p90 <- Hmisc::wtd.quantile(x,weights=w,probs=0.90,na.rm=T)
      ss$p95 <- Hmisc::wtd.quantile(x,weights=w,probs=0.95,na.rm=T)
      ss$p99 <- Hmisc::wtd.quantile(x,weights=w,probs=0.99,na.rm=T)
    }
    ss$max <- max(x,na.rm=T)
  } 
  else{
    ss$weight <- "Unweighted"
    ss$sum_wgt <- sum(1-is.na(x))
    ss$nb_missing <- sum(is.na(x))
    ss$sum_wgt_missing <- sum(is.na(x))
    ss$wtd_mean <- mean(x,na.rm=T)
    ss$wtd_sd <- var(x,na.rm=T)**0.5
    ss$ESS <- ss$sum_wgt^2/sum((1-is.na(x))^2)
    ss$wtd_se <- ss$wtd_sd/(ss$sum_wgt^0.5)
    ss$min <- min(x,na.rm=T)
    if(q==T){
      ss$p01 <- quantile(x,weights=w,probs=0.01,na.rm=T)
      ss$p05 <- quantile(x,weights=w,probs=0.05,na.rm=T)
      ss$p10 <- quantile(x,weights=w,probs=0.10,na.rm=T)
      ss$q1 <- quantile(x,weights=w,probs=0.25,na.rm=T)
      ss$q2 <- quantile(x,weights=w,probs=0.50,na.rm=T)
      ss$q3 <- quantile(x,weights=w,probs=0.75,na.rm=T)
      ss$p90 <- quantile(x,weights=w,probs=0.90,na.rm=T)
      ss$p95 <- quantile(x,weights=w,probs=0.95,na.rm=T)
      ss$p99 <- quantile(x,weights=w,probs=0.99,na.rm=T)
    }
    ss$max <- max(x,na.rm=T)
  }
  structure(ss)
}

## Lags and forwards

# Simplelag gets a variable value from a given line above. Forward gets the value from a given line below. It's inspired from the lag and forward function in SAS.

#mylag: number of lags

#myforward: number of forward

#by: enables to do a lag or of forward by a grouping variable

#outside: default value when lags or forwards are not available

simplelag <-function(x,by=NULL,mylag=1,outside=NA)
{
  myend<-length(x)- mylag
  if (!is.null(by)) {
    lby<-c(replicate(mylag,""),as.character(by[1:myend]))
    y0<-c(replicate(mylag,outside),x[1:myend])
    y<-ifelse(as.character(by)==lby,y0,outside)
    
  }
  else {
    y<-c(replicate(mylag,outside),x[1:myend])
  }
}


forward<-function(x,by=NULL,myforward=1,outside=NA)
{
  mystart<-myforward+1
  if (!is.null(by)) {
    fby<-c(as.character(by[mystart:length(x)]),replicate(myforward,""))
    y0<-c(x[mystart:length(x)],replicate(myforward,outside))
    y<-ifelse(as.character(by)==fby,y0,outside)
  }
  else {
    y<-c(x[mystart:length(x)],replicate(myforward,outside))
  }
}

#Example

#Value of d$x one row above
d$lag_x<-simplelag(d$x)

#Value of d$x three rows above, 0 in the two first rows
d$lag3_x<-simplelag(d$x,mylag=3, outside=0)

#Value of d$x two rows below
d$fwd2_x<-forward(d$x,myforward=2) 

#Value of d$x one row above if in the same group, 0 otherwise
d$lag_x<-simplelag(d$x,by=d$group,outside=0)

## Retain and obtain

Retain is the equivalent of the retain instruction in SAS. Obtain is the opposite of Retain.

retain<-function(x,event,outside=NA)
{
  indices <- c(1,which(event==TRUE), length(x)+1)
  values <- c(outside,x[event %in% TRUE])
  y<- rep(values, diff(indices)) 
}

obtain<-function(x,event,outside=NA)
{
  indices <- c(0,which(event==TRUE), length(x))
  values <- c(x[event %in% TRUE],outside)
  y<- rep(values, diff(indices)) 
}

#Example

#Retain in the following rows the last x where event =1
d$last_x<-retain(d$x,d$event==1)

#Obtain in the first rows the first x where event =2
d$first_x<-obtain(d$x,d$event==2)

## Winsoring

winsor<-function(x,tmax=0.99,tmin=0.01)
{
  y<-pmin(x,quantile(x,probs=tmax,na.rm=TRUE))
  y<-pmax(y,quantile(y,probs=tmin,na.rm=TRUE))
}

## Triming

trim <- function (x) gsub("^\\s+|\\s+$", "", x)

## Reversing a string

strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")

## Clustering

# clx and mclx enable to robust-cluster standard errors in a regressions. Those functions are not mine. clx clusters according to one variable, mclx according to two variables.

clx <-
  function(fm, dfcw, cluster){
    library(sandwich)
    library(lmtest)
    M <- length(unique(cluster))
    N <- length(cluster)
    dfc <- (M/(M-1))*((N-1)/(N-fm$rank))
    u <- apply(estfun(fm),2,
               function(x) tapply(x, cluster, sum))
    vcovCL <- dfc*sandwich(fm, meat=crossprod(u)/N)*dfcw
    coeftest(fm, vcovCL) }

mclx <-
  function(fm, dfcw, cluster1, cluster2){
    library(sandwich)
    library(lmtest)
    cluster12 = paste(cluster1,cluster2, sep="")
    M1 <- length(unique(cluster1))
    M2 <- length(unique(cluster2))
    M12 <- length(unique(cluster12))
    N <- length(cluster1)
    K <- fm$rank
    dfc1 <- (M1/(M1-1))*((N-1)/(N-K))
    dfc2 <- (M2/(M2-1))*((N-1)/(N-K))
    dfc12 <- (M12/(M12-1))*((N-1)/(N-K))
    u1 <- apply(estfun(fm), 2,
                function(x) tapply(x, cluster1, sum))
    u2 <- apply(estfun(fm), 2,
                function(x) tapply(x, cluster2, sum))
    u12 <- apply(estfun(fm), 2,
                 function(x) tapply(x, cluster12, sum))
    vc1 <- dfc1*sandwich(fm, meat=crossprod(u1)/N )
    vc2 <- dfc2*sandwich(fm, meat=crossprod(u2)/N )
    vc12 <- dfc12*sandwich(fm, meat=crossprod(u12)/N)
    vcovMCL <- (vc1 + vc2 - vc12)*dfcw
    coeftest(fm, vcovMCL)}