Je construis parfois des petites fonctions qui peuvent être utiles.
## Improved summary function, with weights, SD, SE, nb obs & quantiles
wtd.summary <- function (x,w=NULL,q=F) {
ss <- NULL
name <- paste(deparse(substitute(x)),collapse=" ")
ss <- data.frame(name)
ss$nb_obs <- length(x)
if(!is.null(w)){
ss$weight <- paste(deparse(substitute(w)),collapse=" ")
ss$sum_wgt <- sum(w[is.na(x) ==F])
ss$nb_missing <- sum(is.na(x))
ss$sum_wgt_missing <- sum(w[is.na(x) ==T])
ss$wtd_mean <- Hmisc::wtd.mean(x,w=w,na.rm=T)
ss$wtd_sd <- Hmisc::wtd.var(x,w=w,na.rm=T)**0.5
ss$ESS <- ss$sum_wgt^2/sum(w[is.na(x) ==F]^2)
ss$wtd_se <- diagis::weighted_se(x,w=w,na.rm=T)
ss$min <- min(x,na.rm=T)
if(q==T){
ss$p01 <- Hmisc::wtd.quantile(x,weights=w,probs=0.01,na.rm=T)
ss$p05 <- Hmisc::wtd.quantile(x,weights=w,probs=0.05,na.rm=T)
ss$p10 <- Hmisc::wtd.quantile(x,weights=w,probs=0.10,na.rm=T)
ss$q1 <- Hmisc::wtd.quantile(x,weights=w,probs=0.25,na.rm=T)
ss$q2 <- Hmisc::wtd.quantile(x,weights=w,probs=0.50,na.rm=T)
ss$q3 <- Hmisc::wtd.quantile(x,weights=w,probs=0.75,na.rm=T)
ss$p90 <- Hmisc::wtd.quantile(x,weights=w,probs=0.90,na.rm=T)
ss$p95 <- Hmisc::wtd.quantile(x,weights=w,probs=0.95,na.rm=T)
ss$p99 <- Hmisc::wtd.quantile(x,weights=w,probs=0.99,na.rm=T)
}
ss$max <- max(x,na.rm=T)
}
else{
ss$weight <- "Unweighted"
ss$sum_wgt <- sum(1-is.na(x))
ss$nb_missing <- sum(is.na(x))
ss$sum_wgt_missing <- sum(is.na(x))
ss$wtd_mean <- mean(x,na.rm=T)
ss$wtd_sd <- var(x,na.rm=T)**0.5
ss$ESS <- ss$sum_wgt^2/sum((1-is.na(x))^2)
ss$wtd_se <- ss$wtd_sd/(ss$sum_wgt^0.5)
ss$min <- min(x,na.rm=T)
if(q==T){
ss$p01 <- quantile(x,weights=w,probs=0.01,na.rm=T)
ss$p05 <- quantile(x,weights=w,probs=0.05,na.rm=T)
ss$p10 <- quantile(x,weights=w,probs=0.10,na.rm=T)
ss$q1 <- quantile(x,weights=w,probs=0.25,na.rm=T)
ss$q2 <- quantile(x,weights=w,probs=0.50,na.rm=T)
ss$q3 <- quantile(x,weights=w,probs=0.75,na.rm=T)
ss$p90 <- quantile(x,weights=w,probs=0.90,na.rm=T)
ss$p95 <- quantile(x,weights=w,probs=0.95,na.rm=T)
ss$p99 <- quantile(x,weights=w,probs=0.99,na.rm=T)
}
ss$max <- max(x,na.rm=T)
}
structure(ss)
}
## Lags and forwards
# Simplelag gets a variable value from a given line above. Forward gets the value from a given line below. It's inspired from the lag and forward function in SAS.
#mylag: number of lags
#myforward: number of forward
#by: enables to do a lag or of forward by a grouping variable
#outside: default value when lags or forwards are not available
simplelag <-function(x,by=NULL,mylag=1,outside=NA)
{
myend<-length(x)- mylag
if (!is.null(by)) {
lby<-c(replicate(mylag,""),as.character(by[1:myend]))
y0<-c(replicate(mylag,outside),x[1:myend])
y<-ifelse(as.character(by)==lby,y0,outside)
}
else {
y<-c(replicate(mylag,outside),x[1:myend])
}
}
forward<-function(x,by=NULL,myforward=1,outside=NA)
{
mystart<-myforward+1
if (!is.null(by)) {
fby<-c(as.character(by[mystart:length(x)]),replicate(myforward,""))
y0<-c(x[mystart:length(x)],replicate(myforward,outside))
y<-ifelse(as.character(by)==fby,y0,outside)
}
else {
y<-c(x[mystart:length(x)],replicate(myforward,outside))
}
}
#Example
#Value of d$x one row above
d$lag_x<-simplelag(d$x)
#Value of d$x three rows above, 0 in the two first rows
d$lag3_x<-simplelag(d$x,mylag=3, outside=0)
#Value of d$x two rows below
d$fwd2_x<-forward(d$x,myforward=2)
#Value of d$x one row above if in the same group, 0 otherwise
d$lag_x<-simplelag(d$x,by=d$group,outside=0)
## Retain and obtain
Retain is the equivalent of the retain instruction in SAS. Obtain is the opposite of Retain.
retain<-function(x,event,outside=NA)
{
indices <- c(1,which(event==TRUE), length(x)+1)
values <- c(outside,x[event %in% TRUE])
y<- rep(values, diff(indices))
}
obtain<-function(x,event,outside=NA)
{
indices <- c(0,which(event==TRUE), length(x))
values <- c(x[event %in% TRUE],outside)
y<- rep(values, diff(indices))
}
#Example
#Retain in the following rows the last x where event =1
d$last_x<-retain(d$x,d$event==1)
#Obtain in the first rows the first x where event =2
d$first_x<-obtain(d$x,d$event==2)
## Winsoring
winsor<-function(x,tmax=0.99,tmin=0.01)
{
y<-pmin(x,quantile(x,probs=tmax,na.rm=TRUE))
y<-pmax(y,quantile(y,probs=tmin,na.rm=TRUE))
}
## Triming
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
## Reversing a string
strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
## Clustering
# clx and mclx enable to robust-cluster standard errors in a regressions. Those functions are not mine. clx clusters according to one variable, mclx according to two variables.
clx <-
function(fm, dfcw, cluster){
library(sandwich)
library(lmtest)
M <- length(unique(cluster))
N <- length(cluster)
dfc <- (M/(M-1))*((N-1)/(N-fm$rank))
u <- apply(estfun(fm),2,
function(x) tapply(x, cluster, sum))
vcovCL <- dfc*sandwich(fm, meat=crossprod(u)/N)*dfcw
coeftest(fm, vcovCL) }
mclx <-
function(fm, dfcw, cluster1, cluster2){
library(sandwich)
library(lmtest)
cluster12 = paste(cluster1,cluster2, sep="")
M1 <- length(unique(cluster1))
M2 <- length(unique(cluster2))
M12 <- length(unique(cluster12))
N <- length(cluster1)
K <- fm$rank
dfc1 <- (M1/(M1-1))*((N-1)/(N-K))
dfc2 <- (M2/(M2-1))*((N-1)/(N-K))
dfc12 <- (M12/(M12-1))*((N-1)/(N-K))
u1 <- apply(estfun(fm), 2,
function(x) tapply(x, cluster1, sum))
u2 <- apply(estfun(fm), 2,
function(x) tapply(x, cluster2, sum))
u12 <- apply(estfun(fm), 2,
function(x) tapply(x, cluster12, sum))
vc1 <- dfc1*sandwich(fm, meat=crossprod(u1)/N )
vc2 <- dfc2*sandwich(fm, meat=crossprod(u2)/N )
vc12 <- dfc12*sandwich(fm, meat=crossprod(u12)/N)
vcovMCL <- (vc1 + vc2 - vc12)*dfcw
coeftest(fm, vcovMCL)}