#__________________________________________ # Library and upload #### #__________________________________________ # install.packages("RecordLinkage") # install.packages("plyr") # install.packages("stringi") # install.packages("data.table") library(RecordLinkage) library("plyr",include.only = "rbind.fill") library("dplyr") library("stringi") library("stringr") library("data.table") qq1 <- NULL qq2 <- NULL qq3 <- NULL qq4 <- NULL qq5 <- NULL qq6 <- NULL setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) aa <- readRDS("base_candidatEs_public.rds") # Export base candidats #### # aa <- aa[,-which(names(aa) %in% c("nom_std","prenom_std","nom_prenom_std","epouse_std","nee_std","nom_dit_std","prenom_dit_std","scrutin_det"))] # str(aa) # saveRDS(aa,"base_candidats_public.rds") # write.csv(aa,"base_candidats_public.csv",row.names=F,na='') mycol <- colnames(aa) mycol2 <- list(mycol) str(mycol2) str(aa) nrow(aa) aa <- data.table(aa) aa[,c("id_election_candidat","id_personne","id_personne_long"):=NULL] # aa <- aa[,..mycol] aa$id_election_candidat_long <- as.double(aa$id_election_candidat_long) str(aa) #_______________________________________________________# # Standardization of names (optional) #### #_______________________________________________________# # Switch to TRUE to rerun name standardization nom_prenom_std <- TRUE # nom_prenom_std <- TRUE if (nom_prenom_std ==TRUE){ aa$nom_prenom_std_bak <- aa$nom_prenom_std # aa$nom_prenom_std <- aa$nom_prenom_std_bak aa$nom_prenom_std <- NULL ## Cleaning Nom #### aa$nom_std <- str_to_lower(aa$nom) aa$nom_std <- stri_trans_general(aa$nom_std,id = "Latin-ASCII") aa$nom_std <- str_replace(aa$nom_std, "\\-", " ") aa$nom_std <- str_replace(aa$nom_std, "\\.", " ") aa$nom_std <- str_replace(aa$nom_std, "'", " ") aa$nom_std <- str_replace(aa$nom_std, "\\?", " ") aa$nom_std <- str_replace(aa$nom_std, ";", " ") aa$nom_std <- str_replace(aa$nom_std, ",", " ") aa$nom_std <- str_replace(aa$nom_std, "'", " ") ## Delete spouse name in parenthesis or ep. or epouse #### aa$nom_std <- sub("\\(.*\\)", "", aa$nom_std) aa$epouse_std <- ifelse(regexpr(" ep ",aa$nom_std) > 0, substr(aa$nom_std,regexpr(" ep ",aa$nom_std)+4,nchar(aa$nom_std)), NA) aa$nom_std <- ifelse(regexpr(" ep ",aa$nom_std) > 0, substr(aa$nom_std,1,regexpr(" ep ",aa$nom_std)), aa$nom_std) aa$epouse_std <- ifelse(regexpr(" epouse ",aa$nom_std) > 0, substr(aa$nom_std,regexpr(" epouse ",aa$nom_std)+8, nchar(aa$nom_std)),aa$epouse_std) aa$nom_std <- ifelse(regexpr(" epouse ",aa$nom_std) > 0, substr(aa$nom_std,1,regexpr(" epouse ",aa$nom_std)), aa$nom_std) ## Dit #### aa$nom_dit_std <- ifelse(regexpr(" dit ",aa$nom_std) > 0, substr(aa$nom_std,regexpr(" dit ",aa$nom_std)+5,nchar(aa$nom_std)), NA) aa$nom_std <- ifelse(regexpr(" dit ",aa$nom_std) > 0, substr(aa$nom_std,1,regexpr(" dit ",aa$nom_std)), aa$nom_std) aa$nom_dit_std <- ifelse(regexpr(" dite ",aa$nom_std) > 0, substr(aa$nom_std,regexpr(" dite ",aa$nom_std)+6,nchar(aa$nom_std)), aa$nom_dit_std) aa$nom_std <- ifelse(regexpr(" dite ",aa$nom_std) > 0, substr(aa$nom_std,1,regexpr(" dite ",aa$nom_std)), aa$nom_std) aa$nom_std <- sub("<.*<", "", aa$nom_std) aa$nom_std <- str_replace(aa$nom_std, "<", " ") ## Nee #### aa$nee_std <- ifelse(regexpr(" nee ",aa$nom_std) > 0, substr(aa$nom_std,regexpr(" nee ",aa$nom_std)+5, nchar(aa$nom_std)),NA) aa$nom_std <- ifelse(regexpr(" nee ",aa$nom_std) > 0, substr(aa$nom_std,1,regexpr(" nee ",aa$nom_std)), aa$nom_std) ## Removing multiple white spaces #### aa$nom_std <- str_squish(aa$nom_std) ## Triming #### aa$nom_std <- str_trim(aa$nom_std) ## Cleaning Prenom #### aa$prenom_std <- str_to_lower(aa$prenom) aa$prenom_std <- stri_trans_general(aa$prenom_std,id = "Latin-ASCII") aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "j\\.", "jean "),aa$prenom_std) aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "f\\.", "francois "),aa$prenom_std) aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "m\\.", "marc "),aa$prenom_std) aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "p\\.", "pierre "),aa$prenom_std) aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "y\\.", "yves "),aa$prenom_std) aa$prenom_std <- ifelse(aa$sexe %in% "F",str_replace(aa$prenom_std, "a\\.", "anne "),aa$prenom_std) aa$prenom_std <- ifelse(aa$sexe %in% "F",str_replace(aa$prenom_std, "m\\.", "marie "),aa$prenom_std) aa$prenom_std <- str_replace(aa$prenom_std, "\\-", " ") aa$prenom_std <- str_replace(aa$prenom_std, "\\.", " ") aa$prenom_std <- str_replace(aa$prenom_std, "\\?", " ") aa$prenom_std <- str_replace(aa$prenom_std, ";", " ") aa$prenom_std <- str_replace(aa$prenom_std, ",", " ") aa$prenom_std <- str_replace(aa$prenom_std, "'", " ") ## Prenom Dit #### aa$prenom_dit_std <- ifelse(regexpr(" dit ",aa$prenom_std) > 0, substr(aa$prenom_std,regexpr(" dit ",aa$prenom_std)+5, nchar(aa$prenom_std)),NA) aa$prenom_std <- ifelse(regexpr(" dit ",aa$prenom_std) > 0, substr(aa$prenom_std,1,regexpr(" dit ",aa$prenom_std)), aa$prenom_std) aa$prenom_dit_std <- ifelse(regexpr(" dite ",aa$prenom_std) > 0, substr(aa$prenom_std,regexpr(" dite ",aa$prenom_std)+6,nchar(aa$prenom_std)), aa$prenom_dit_std) aa$prenom_std <- ifelse(regexpr(" dite ",aa$prenom_std) > 0, substr(aa$prenom_std,1,regexpr(" dite ",aa$prenom_std)), aa$prenom_std) aa$prenom_std <- sub("<.*<", "", aa$prenom_std) aa$prenom_std <- str_replace(aa$prenom_std, "<", " ") aa$prenom_std <- sub("\\(.*\\)", "", aa$prenom_std) #Delete other firstname in parenthesis aa$prenom_std <- str_squish(aa$prenom_std) aa$prenom_std <- str_trim(aa$prenom_std) ## nom_prenom_std #### aa$nom_prenom_std <- str_trim(paste(aa$nom_std,aa$prenom_std,sep=" ")) ## Error name fuplicate in the cantonale 1998 #### aa$nb_dup_dep_election <- ave(aa$count, paste(aa$nom_prenom_std,aa$code_departement,aa$election_an_tour),FUN=sum) table(aa$election_an,aa$nb_dup_dep_election) aa$nom_prenom_std <- ifelse(aa$nb_dup_dep_election > 2 & aa$election_an=="cantonale 1998", paste(aa$nom_prenom_std,"Erreur Nom",aa$id_election_candidat_long),aa$nom_prenom_std) aa$nb_dup_dep_election <- NULL # Changes table(aa$nom_prenom_std_bak == aa$nom_prenom_std) } aa$count <- 1 aa <- aa[order(aa$nom_prenom_std, -is.na(aa$date_naissance)*1, -aa$date_naissance, aa$date, -aa$election),] aa$id_election_candidat <- as.double(cumsum(aa$count) + 10^7) rownames(aa) <- NULL #_____________________________________________________________# # Matching the two rounds of different elections #### #_____________________________________________________________# aa$election_bak <- aa$election aa$election_an_bak <- aa$election_an aa$election_an_tour_bak <- aa$election_an_tour aa$election[aa$election %in% c("municipale (petites communes)","maire")] <- "municipale" aa$election_an <- paste(aa$election, aa$annee) aa$election_an_tour <- paste(aa$election,aa$annee, aa$tour) ## Rounds matching function #### aa$code_unite_election_an <- paste(aa$code_unite,aa$election_an) gc() rnd_match <- function (idin="",idout="") { aa[,which(names(aa) %in% c(idout))] <- NULL aa$my_fid_1 <- aa[[idin]] # New id as the max of id by unit, with creation of a new aa[,`:=` (my_fid=max(my_fid_1)),by=paste(key,code_unite_election_an)] # Rematching the ids aa[,`:=` (my_fid1=max(my_fid)),by=my_fid_1] # Number of initial matches print("Number of initial matches") print(table(1-(aa$my_fid==aa$my_fid_1))) # Number of rematches after looping on previous ids print("Number of rematches after looping on previous ids") print(table(1-(aa$my_fid1==aa$my_fid_1))) aa$my_fid <- aa$my_fid1 # Check if identification creates duplicates aa[,`:=` (nb_pb=sum(count)),by=paste(my_fid,election_an_tour,date_debut_ministere)] aa[,`:=` (max_nb_pb=max(nb_pb)),by=paste(my_fid)] print("Number of identifications with different application for the same election at the same date (pb >1)") print(table(aa$max_nb_pb)) # Already matched aa[,`:=` (match_unit_1=sum(count)),by=paste(my_fid_1,code_unite_election_an)] # Not matched aa[,`:=` (match_unit=sum(count)),by=paste(my_fid,code_unite_election_an)] aa$my_fid <- ifelse(aa$max_nb_pb>1 | aa$match_unit_1==2 | aa$match_unit==1,aa$my_fid_1,aa$my_fid) # View(aa[aa$max_nb_pb>1,c("nom_prenom_std","my_fid","election_an_tour","date_debut_ministere","code_unite")]) # Correction of duplicated identification by date birth aa[,`:=` (nb_my_fid_1_nais=sum(count)),by=paste(my_fid,date_naissance)] aa[,`:=` (nb_my_fid_1_nais_dif=sum(ifelse(is.na(date_naissance)==T,0,1/nb_my_fid_1_nais))),by=my_fid] print("Number of identification with different birth dates (pb >1)") print(table(aa$nb_my_fid_1_nais_dif)) # Final Identification aa$my_fid <- ifelse(aa$nb_my_fid_1_nais_dif>1,aa$my_fid_1,aa$my_fid) #Quality check qq1_fid <- table(1-(aa$my_fid != (aa$my_fid_1)) ) print("Number of changes in Ids") print(qq1_fid) aa$my_fid <- aa$my_fid aa_t1 <- aa[aa$tour==1 & aa$nb_tour==2,c("my_fid","tour","election_an")] aa_t1$tour <- 2 aa_t1$match_t1 <- 1 aa_t2 <- aa[aa$tour==2,c("my_fid","tour","election_an")] aa_t2t1 <- merge(aa_t2,aa_t1,by=c("my_fid","election_an","tour"),all.x=T) qq2_fid <- table(1-is.na(aa_t2t1$match_t1)) qq3_fid <- table(aa_t2t1$election_an,1-is.na(aa_t2t1$match_t1)) print("Number of second rounds foudn in first round") print(qq2) print("Number of second rounds foudn in first round by election") print(qq3_fid) nrow(aa_t2) nrow(aa_t2t1) aa[,c("nb_my_fid_1_nais","nb_pb","max_nb_pb","nb_my_fid_1_nais_dif","match_unit","match_unit_1"):=NULL] qq1_fid <- data.frame(qq1_fid) colnames(qq1_fid) <- c("Var","Freq") qq1_fid$id <- idout qq2_fid <- data.frame(qq2_fid) colnames(qq2_fid) <- c("Var","Freq") qq2_fid$id <- idout qq3_fid <- data.frame(qq3_fid) colnames(qq3_fid) <- c("Var1","Var2","Freq") qq3_fid$id <- idout qq1 <<- rbind.fill(qq1,qq1_fid) qq2 <<- rbind.fill(qq2,qq2_fid) qq3 <<- rbind.fill(qq3,qq3_fid) qq4_fid <- length(table(aa$my_fid)) qq4_fid <- data.frame(qq4_fid) colnames(qq4_fid) <- c("n") qq4_fid$id <- idout qq4 <- rbind(qq4,qq4_fid) colnames(aa)[which(names(aa) %in% c("my_fid"))] <- idout aa[,which(names(aa) %in% c(paste(idin,"_1",sep="")))] <- NULL aa[,which(names(aa) %in% c(paste(idout,"1",sep="")))] <- NULL aa[,c("my_fid_1","my_fid1"):=NULL] aa <<- aa qq4 <<- qq4 print("Number of ids") print(idout) print(qq4_fid) } #_____________________________________________________________ ## 2 Chaining on code_unite_election_an & nom_prenom_std & date_naissance #### #_____________________________________________________________ aa$key <- paste(aa$nom_prenom_std,aa$date_naissance) rnd_match(idin="id_election_candidat",idout="my_id2a") colnames(aa) #_____________________________________________________________ ## 2b Chaining on code_unite_election_an & nom_prenom_std #### #_____________________________________________________________ # When there's no visible homonyms in any election in the commune # and there's no more than one birth date in the commune # we apply the same id to all nom_prenom_std str(aa) aa$key <- aa$nom_prenom_std rnd_match(idin="my_id2a",idout="my_id2b") #_____________________________________________________________ ## 2c Chaining on code_unite_election_an, nom_std, sexe & date_naissance #### #_____________________________________________________________ aa$key <- paste(aa$nom_std,aa$sexe,aa$date_naissance) rnd_match(idin="my_id2b",idout="my_id2c") #_____________________________________________________________ ## 2d Chaining on code_unite_election_an, nom_std, sexe #### #____________________________________________________________ aa$key <- paste(aa$nom_std,aa$sexe) rnd_match(idin="my_id2c",idout="my_id2d") #____________________________________________________________ ## 2e Chaining on code_unite_election_an, prenom_std, sexe, nuance, date_naissance, #### #_____________________________________________________________ aa$key <- paste(aa$prenom_std,aa$sexe,aa$date_naissance,aa$nuance) rnd_match(idin="my_id2d",idout="my_id2e") #_____________________________________________________________ ## 2f Chaining on code_unite_election_an, prenom_std, sexe, nuance #### #_____________________________________________________________ aa$key <- paste(aa$prenom_std,aa$sexe,aa$nuance) rnd_match(idin="my_id2e",idout="my_id2f") #_____________________________________________________________ ## 2g Chaining on code_unite_election_an, prenom_std, sexe, nom_tete_liste, date_naissance #### #_____________________________________________________________ aa$key <- paste(aa$prenom_std,aa$sexe,aa$date_naissance,tolower(aa$nom_tete_liste)) rnd_match(idin="my_id2f",idout="my_id2g") #_____________________________________________________________ ## 2h Chaining on code_unite_election_an, prenom_std, sexe, nom_tete_liste #### #_____________________________________________________________ aa$key <- paste(aa$prenom_std,aa$sexe,tolower(aa$nom_tete_liste)) rnd_match(idin="my_id2g",idout="my_id2h") #_____________________________________________________________ ## 2i Chaining on code_unite_election_an, prenom_std, sexe #### #_____________________________________________________________ aa$key <- paste(aa$prenom_std,aa$sexe) rnd_match(idin="my_id2h",idout="my_id2i") #_____________________________________________________________ # Cleaning of round 1-2 matches #### #_____________________________________________________________ # aa[,c("id_election_candidat_t2","name_cor_ok.x","nom_prenom_std_cor","nom_std_cor","prenom_std_cor","elu_t2","date_naissance_cor2","t2","t2.y","name_cor_ok.y"):=NULL] str(aa) gc() aa$date_naissance_cor <- aa$date_naissance aa_t2 <- aa[aa$tour==2,c("my_id2i","tour","election_an","id_election_candidat","nom_prenom_std","nom_std","prenom_std","elu_ce_tour","date_naissance_cor")] aa_t2$tour <- 1 aa_t2$t2 <- 1 colnames(aa_t2)[4:9] <- c("id_election_candidat_t2","nom_prenom_std_cor","nom_std_cor","prenom_std_cor","elu_t2","date_naissance_cor2") str(aa) str(aa_t2) aa <- merge(aa,aa_t2,by=c("my_id2i","election_an","tour"),all.x=T) nrow(aa) aa$nom_prenom_std_cor <- ifelse(is.na(aa$nom_prenom_std_cor)==T,aa$nom_prenom_std,aa$nom_prenom_std_cor) aa$nom_std_cor <- ifelse(is.na(aa$nom_std_cor)==T,aa$nom_std,aa$nom_std_cor) aa$prenom_std_cor <- ifelse(is.na(aa$prenom_std_cor)==T,aa$prenom_std,aa$prenom_std_cor) ## Name match correction cleaning #### # We check if first names match do not lead to too distant last names check_name <- aa[aa$nom_std_cor != aa$nom_std,c("my_id2i","election_an","code_unite","nom_std","nom_std_cor","sexe","nee_std","nom_dit_std")] check_name$prox <- levenshteinSim(check_name$nom_std,check_name$nom_std_cor) check_name$prox2 <- (substr(check_name$nom_std,1,3)==substr(check_name$nom_std_cor,1,3))*1 check_name$name_cor_ok <- (check_name$prox>0.5 | check_name$prox2==1)*1 table(check_name$name_cor_ok) aa <- merge(aa,check_name[,c("my_id2i","name_cor_ok")],by="my_id2i",all.x=T) table(aa$name_cor_ok) aa$date_naissance_cor2 <- ifelse(aa$name_cor_ok %in% 0,NA,aa$date_naissance_cor2) aa$my_id2j <- ifelse(aa$name_cor_ok %in% 0,aa$my_id2d,aa$my_id2i) aa$nom_prenom_std_cor <- ifelse(aa$name_cor_ok %in% 0,aa$nom_prenom_std,aa$nom_prenom_std_cor) aa$nom_std_cor <- ifelse(aa$name_cor_ok %in% 0,aa$nom_std,aa$nom_std_cor) aa$prenom_std_cor <- ifelse(aa$name_cor_ok %in% 0,aa$prenom_std,aa$prenom_std_cor) aa$id_election_candidat_t2 <- ifelse(aa$name_cor_ok %in% 0,NA,aa$id_election_candidat_t2) aa$t2 <- ifelse(aa$name_cor_ok %in% 0,NA,aa$t2) aa$elu_t2 <- ifelse(aa$name_cor_ok %in% 0,NA,aa$elu_t2) aa$date_naissance_cor <- ifelse(is.na(aa$date_naissance_cor)==T,aa$date_naissance_cor2,aa$date_naissance_cor) aa$date_naissance_cor2 <- NULL ## Match t2 & t1 #### aa$t2 <- ifelse(aa$tour==1 & aa$nb_tour==2 & is.na(aa$t2)==T,0,aa$t2) table(aa$t2) aa_t1 <- aa[aa$tour==1 & aa$nb_tour==2,c("my_id2j","tour","election_an")] aa_t1$tour <- 2 aa_t1$match_t1 <- 1 aa_t2 <- aa[aa$tour==2,c("my_id2j","tour","election_an")] aa_t2t1 <- merge(aa_t2,aa_t1,by=c("my_id2j","election_an","tour"),all.x=T) qq2_id2j <- table(1-(is.na(aa_t2t1$match_t1))) qq3_id2j <- table(aa_t2t1$election_an,is.na(aa_t2t1$match_t1)) nrow(aa_t2) nrow(aa_t2t1) qq2_id2j <- data.frame(qq2_id2j) colnames(qq2_id2j) <- c("Var","Freq") qq2_id2j$id <- "id2j" qq3_id2j <- data.frame(qq3_id2j) colnames(qq3_id2j) <- c("Var1","Var2","Freq") qq3_id2j$id <- "id2j" qq2 <- rbind.fill(qq2,qq2_id2j) qq3 <- rbind.fill(qq3,qq3_id2j) qq4_fid <- length(table(aa$my_id2j)) print(qq4_fid) qq4_fid <- data.frame(qq4_fid) colnames(qq4_fid) <- c("n") qq4_fid$id <- "my_id2j" qq4 <- rbind(qq4,qq4_fid) #------------------------------# ## The 2 rounds #### #------------------------------# aa_t1 <- aa[aa$tour==1 & aa$nb_tour==2,c("my_id2j","tour","election_an","id_election_candidat","elu_ce_tour")] aa_t1$tour <- 2 aa_t1$t1 <- 1 colnames(aa_t1)[c(4:5)] <- c("id_election_candidat_t1","elu_t1") aa <- merge(aa,aa_t1,by=c("my_id2j","election_an","tour"),all.x=T) nrow(aa) aa$t1 <- ifelse(aa$tour==2 & is.na(aa$t1)==T,0,aa$t1) table(aa$t1) ss_t2 <- aa[aa$tour==2 & aa$t1==1,] nrow(ss_t2) aa <- aa[(aa$tour==2 & aa$t1==1)==F,] nrow(aa) # nrow(bb)+nrow(ss_t2) # rm(bb) aa[,which(substr(names(aa),1,5) %in% "my_id"):= NULL] ss_t2[,which(substr(names(ss_t2),1,5) %in% "my_id"):= NULL] str(ss_t2) nrow(ss_t2) nrow(aa) nrow(aa)+nrow(ss_t2) rm(aa_t1,aa_t2,aa_t2t1) gc() rm(nn,nn_m) str(aa) # saveRDS(aa,"aa_tmp.RDS") # saveRDS(ss_t2,"ss_t2_tmp.RDS") # ss_t2 <- readRDS("ss_t2_tmp.RDS") # aa <- readRDS("aa_tmp.RDS") #_____________________________________________________________ # 3 Base chaining #### #_____________________________________________________________ aa <- aa[order(aa$id_election_candidat),] rownames(aa) <- NULL aa$count <- 1 str(aa) set.seed(5555) aa$epsilon <- round(runif(nrow(aa))/1000,10) ## Chaining function #### # Not totally necessary # But enables to prioritize id of already matched # (Makes a small difference in the last rounds) alpha <- 10^8 beta <- 10^10 # When there's no visible homonyms in any election in the unit # and there's no more than one birth date in the unit # we apply the same id to all key id_match <- function (idin="",idout="") { aa[,which(names(aa) %in% c(idout))] <- NULL # Identification aa$my_fid_1 <- aa[[idin]] # Initial match aa[,`:=` (my_fid=max(alpha+my_fid_1)),by=key] # Relooping on previous ids aa[,`:=` (my_fid1=max(my_fid)),by=my_fid_1] # Change ids print("Initial id changes") print(table(1-(aa$my_fid==alpha+aa$my_fid_1))) print("Id changes after relooping on previous ids") print(table(1-(aa$my_fid1==alpha+aa$my_fid_1))) aa$my_fid <- aa$my_fid1 # checking number of identifications aa[,`:=` (nb_id_1=sum(count+epsilon)),by=paste(my_fid_1)] aa[,`:=` (nb_id=sum(count+epsilon)),by=paste(my_fid)] print("Change in the number of obs to which an id is matched (!=0)") print(table(round(aa$nb_id-aa$nb_id_1))) # Correction of duplicated identification by election aa[,`:=` (nb_pb=sum(count)),by=paste(my_fid,election_an_tour,date_debut_ministere)] aa[,`:=` (max_nb_pb=max(nb_pb)),by=paste(my_fid)] qq5_fid <- table(aa$max_nb_pb) print("Number of identifications with different application for the same election at the same date (pb >1)") print(qq5_fid) aa$my_fid <- ifelse(aa$max_nb_pb>1 | (aa$nb_id-aa$nb_id_1)==0,aa$my_fid_1,aa$my_fid) # Correction of duplicated identification by date birth aa[,`:=` (nb_my_id_nais=sum(count)),by=paste(my_fid,date_naissance_cor)] aa[,`:=` (nb_my_id_nais_dif=sum(ifelse(is.na(date_naissance_cor)==T,0,1/nb_my_id_nais))),by=my_fid] qq6_fid <- table(aa$nb_my_id_nais_dif) print("Number of identification with different birth dates (pb >1)") print(qq6_fid) # Final identification aa$my_fid <- ifelse(aa$nb_my_id_nais_dif>1,aa$my_fid_1,aa$my_fid) # Quality of identification qq2_fid <- table(1- ((aa$my_fid!=aa$my_fid_1+alpha) & (aa$my_fid!=aa$my_fid_1))) print(qq2_fid) qq2_fid <- data.frame(qq2_fid) colnames(qq2_fid) <- c("Var","Freq") qq2_fid$id <- idout aa[,c("nb_my_id_nais","nb_pb","max_nb_pb","nb_my_id_nais_dif","nb_id","nb_id_1"):=NULL] qq4_fid <- length(table(aa$my_fid)) qq4_fid <- data.frame(qq4_fid) colnames(qq4_fid) <- c("n") qq4_fid$id <- idout qq4 <<- rbind(qq4,qq4_fid) qq5_fid <- data.frame(qq5_fid) colnames(qq5_fid) <- c("Var","Freq") qq5_fid$id <- idout qq5 <<- rbind(qq5,qq5_fid) qq6_fid <- data.frame(qq6_fid) colnames(qq6_fid) <- c("Var","Freq") qq6_fid$id <- idout qq6 <<- rbind(qq6,qq6_fid) aa$my_fid <- aa$my_fid + beta names(aa) <- gsub("my_fid",idout,names(aa)) aa[,which(names(aa) %in% c(paste(idin,"_1",sep="")))] <- NULL aa[,which(names(aa) %in% c(paste(idout,"1",sep="")))] <- NULL aa[,c("my_fid_1","my_fid1"):=NULL] aa <<- aa print("Number of ids") print(idout) print(qq4_fid) } #_____________________________________________________________ ## 3a Chaining on nom_prenom_std & date_naissance #### #_____________________________________________________________ str(aa) aa$key <- paste(aa$nom_prenom_std,aa$date_naissance) id_match(idin="id_election_candidat",idout="my_id3a") #_____________________________________________________________ ## 3b Chaining on nom_prenom_std & date_naissance_cor #### #_____________________________________________________________ aa$key <- paste(aa$nom_prenom_std,aa$date_naissance_cor) id_match(idin="my_id3a",idout="my_id3b") #_____________________________________________________________ ## 3c Chaining on nom_prenom_std #### #_____________________________________________________________ aa$key <- aa$nom_prenom_std id_match(idin="my_id3b",idout="my_id3c") #_____________________________________________________________ # 4 Geographical Chaining #### #_____________________________________________________________ #_____________________________________________________________ ## 4a Chaining on code_unite & nom_prenom_std #### #_____________________________________________________________ aa$key <- paste(aa$nom_prenom_std,aa$code_unite) id_match(idin="my_id3c",idout="my_id4a") #_____________________________________________________________ ## 4b Chaining on code_commune & nom_prenom_std #### #_____________________________________________________________ aa$code_commune_bis <- ifelse(is.na(aa$code_commune),aa$id_election_candidat,aa$code_commune) aa$key <- paste(aa$nom_prenom_std,aa$code_commune_bis) id_match(idin="my_id4a",idout="my_id4b") aa$code_commune_bis <- NULL #_____________________________________________________________ ## 4cd Chaining on code_departement & nom_prenom_std #### #_____________________________________________________________ aa$code_departement_bis <- ifelse(is.na(aa$code_departement),aa$id_election_candidat,aa$code_departement) aa$key <- paste(aa$nom_prenom_std,aa$election, aa$code_departement_bis) id_match(idin="my_id4b",idout="my_id4c") aa$key <- paste(aa$nom_prenom_std,aa$code_departement_bis) id_match(idin="my_id4c",idout="my_id4d") aa$code_departement_bis <- NULL qq4 #_____________________________________________________________ ## 4ef Chaining on code_region_2015 & nom_prenom_std #### #_____________________________________________________________ aa$code_region_2015_bis <- ifelse(is.na(aa$code_region_2015),aa$id_election_candidat,aa$code_region_2015) aa$key <- paste(aa$nom_prenom_std,aa$election,aa$code_departement_bis) id_match(idin="my_id4d",idout="my_id4e") aa$key <- paste(aa$nom_prenom_std,aa$code_departement_bis) id_match(idin="my_id4e",idout="my_id4f") aa$code_region_2015_bis <- NULL #_____________________________________________________________ ## 4gh Chaining on code_circo_europ & nom_prenom_std #### #_____________________________________________________________ # When there's no visible homonyms in any election in the european circ. # and there's no more than one birth date in the european circ. # we apply the same id to all nom_prenom_std aa$code_circo_europ_bis <- ifelse(is.na(aa$code_circo_europ),aa$id_election_candidat,aa$code_circo_europ) aa$key <- paste(aa$nom_prenom_std,aa$election,aa$code_circo_europ_bis) id_match(idin="my_id4f",idout="my_id4g") aa$key <- paste(aa$nom_prenom_std,aa$code_circo_europ_bis) id_match(idin="my_id4g",idout="my_id4h") aa$code_circo_europ_bis <- NULL qq4 #_____________________________________________________________ # 5 By election #### #_____________________________________________________________ aa$my_id5a <- NULL aa$key <- paste(aa$nom_prenom_std,aa$election_bak) id_match(idin="my_id4h",idout="my_id5a") aa$election_nationale <- ifelse(aa$election %in% c("europeenne","legislative","ministere","presidentielle","senatoriale"), "nationale", ifelse(aa$election %in% c("maire"), "municipale", aa$election_bak)) aa$my_id5b <- NULL aa$key <- paste(aa$nom_prenom_std,aa$election_nationale) id_match(idin="my_id5a",idout="my_id5b") qq4 str(aa) #_____________________________________________________________ # 6 Mild Chaining on Different spellings #### #_____________________________________________________________ #_____________________________________________________________ ## 6a 2nd round name correction: nom_prenom_std_cor #### #_____________________________________________________________ nom_prenom_cor <- aa[aa$nom_prenom_std_cor != aa$nom_prenom_std,c("nom_prenom_std","nom_prenom_std_cor","prenom_std_cor")] nom_prenom_cor <- unique(nom_prenom_cor) colnames(nom_prenom_cor) <- c("nom_prenom_std","nom_prenom_std_cor2","prenom_std_cor2") str(nom_prenom_cor) nrow(aa) aa$nom_prenom_std_cor2 <- NULL aa <- merge(aa,nom_prenom_cor,by="nom_prenom_std", all.x= T) nrow(aa) table(is.na(aa$nom_prenom_std_cor2)) aa$nom_prenom_std_cor2 <- ifelse(is.na(aa$nom_prenom_std_cor2), aa$nom_prenom_std_cor, aa$nom_prenom_std_cor2) aa$prenom_std_cor2 <- ifelse(is.na(aa$prenom_std_cor2), aa$prenom_std_cor, aa$prenom_std_cor2) aa$key <- aa$nom_prenom_std_cor2 id_match(idin="my_id5b",idout="my_id6a") nrow(aa) #_____________________________________________________________ ## 6b Chaining on nom_std prenom_1_std_cor2 #### #_____________________________________________________________ aa$ok <- NULL aa$prenom_1_std_cor2 <- gsub("([A-Za-z]+).*", "\\1", aa$prenom_std_cor2) aa$nom_prenom_1_std_cor2 <- paste(aa$nom_std,aa$prenom_1_std_cor2) aa$nbprenom <- lengths(gregexpr("\\W+", aa$prenom_std_cor2)) aa$key <- aa$nom_prenom_1_std_cor2 id_match(idin="my_id6a",idout="my_id6b") # Checking if multiple prenom2 among Affected ids aa[,`:=` (nb_chg=sum((my_id6b != my_id6a + alpha) & (my_id6b != my_id6a))), by=paste(my_id6b)] # pp_m : match first names database pp_m <- aa[nb_chg>0,c("my_id6b","prenom_1_std_cor2","prenom_std_cor2","nom_prenom_1_std_cor2","nom_prenom_std_cor2")] pp_m$prenom2 <- sapply(1:nrow(pp_m), function(x) gsub(pp_m$prenom_1_std_cor2[x], "", pp_m$prenom_std_cor2[x])) pp_m[,`:=` (nb_prenom2=sum(my_id6b>0)),by=paste(my_id6b,prenom2)] pp_m[,`:=` (nb_prenom_vide=sum(prenom2 %in% c(""," ",NA))),by=paste(my_id6b)] pp_m[,`:=` (nb_id=sum(my_id6b>0)),by=paste(my_id6b)] # Ok are correct matches pp_m[,`:=` (ok=max(ifelse(prenom2 %in% c(""," ",NA)==F & (nb_prenom_vide + nb_prenom2==nb_id),1,0))),by=paste(my_id6b)] aa$ok <- NULL aa <- merge(aa,unique(pp_m[,c("my_id6b","ok")]),by="my_id6b",all.x = T) aa$my_id6b <- ifelse(aa$ok %in% 0,aa$my_id6a+beta,aa$my_id6b) qq4_fid <- length(table(aa$my_id6b)) print(qq4_fid) qq4_fid <- data.frame(qq4_fid) colnames(qq4_fid) <- c("n") qq4_fid$id <- "my_id6b" qq4 <- rbind(qq4,qq4_fid) # Correction of duplicated identification by election aa[,`:=` (nb_pb=sum(count)),by=paste(my_id6b,election_an_tour,date_debut_ministere)] aa[,`:=` (max_nb_pb=max(nb_pb)),by=paste(my_id6b)] qq5_fid <- table(aa$max_nb_pb) print("Number of identifications with different application for the same election at the same date (pb >1)") print(qq5_fid) # Correction of duplicated identification by date birth aa[,`:=` (nb_my_id_nais=sum(count)),by=paste(my_id6b,date_naissance_cor)] aa[,`:=` (nb_my_id_nais_dif=sum(ifelse(is.na(date_naissance_cor)==T,0,1/nb_my_id_nais))),by=my_id6b] qq6_fid <- table(aa$nb_my_id_nais_dif) print("Number of identification with different birth dates (pb >1)") print(qq6_fid) qq5_fid <- data.frame(qq5_fid) colnames(qq5_fid) <- c("Var","Freq") qq5_fid$id <- "my_id6b" qq5 <- rbind(qq5,qq5_fid) qq6_fid <- data.frame(qq6_fid) colnames(qq6_fid) <- c("Var","Freq") qq6_fid$id <- "my_id6b" qq6 <- rbind(qq6,qq6_fid) # 18 incorrect matches --> corrected by hand for 4d # View(aa[aa$max_nb_pb>1 | nb_my_id_nais_dif>1,]) aa$idlast <- ifelse(aa$max_nb_pb>1 | aa$nb_my_id_nais_dif>1,aa$my_id4d,aa$my_id6b) # Correction of duplicated identification by election aa[,`:=` (nb_pb=sum(count)),by=paste(idlast,election_an_tour,date_debut_ministere)] aa[,`:=` (max_nb_pb=max(nb_pb)),by=paste(idlast)] qq5_fid <- table(aa$max_nb_pb) print("Number of identifications with different application for the same election at the same date (pb >1)") print(qq5_fid) # Correction of duplicated identification by date birth aa[,`:=` (nb_my_id_nais=sum(count)),by=paste(idlast,date_naissance_cor)] aa[,`:=` (nb_my_id_nais_dif=sum(ifelse(is.na(date_naissance_cor)==T,0,1/nb_my_id_nais))),by=idlast] qq6_fid <- table(aa$nb_my_id_nais_dif) print("Number of identification with different birth dates (pb >1)") print(qq6_fid) qq4_fid <- length(table(aa$idlast)) print(qq4_fid) qq4_fid <- data.frame(qq4_fid) colnames(qq4_fid) <- c("n") qq4_fid$id <- "idlast" qq4 <- rbind(qq4,qq4_fid) qq5_fid <- data.frame(qq5_fid) colnames(qq5_fid) <- c("Var","Freq") qq5_fid$id <- "idlast" qq5 <- rbind(qq5,qq5_fid) qq6_fid <- data.frame(qq6_fid) colnames(qq6_fid) <- c("Var","Freq") qq6_fid$id <- "idlast" qq6 <- rbind(qq6,qq6_fid) # saveRDS(aa,"aa_tmp2.rds") aa[,which(substr(names(aa),1,5) %in% "my_id"):= NULL] aa[,c("nb_my_id_nais","nb_pb","max_nb_pb","nb_my_id_nais_dif","match_unit_1","match_unit","key","ok","nbprenom"):=NULL] str(aa) qq4 <- rbind(qq4,qq4_fid) qq4 #--------------------------------------# # Reappending, cleaning and save #### #--------------------------------------# ## Final id personne #### aa[,`:=` (id_personne_long=min(id_election_candidat_long)),by=paste(idlast)] nrow(aa) ## More matches round 1 & 2 #### #Checks aa_t1 <- aa[aa$tour==1 & aa$nb_tour==2,c("idlast","tour","election_an","code_unite")] aa_t1$tour <- 2 aa_t1$match_t1 <- 1 aa_t2 <- aa[aa$tour==2,c("idlast","tour","election_an","code_unite")] aa_t2t1 <- merge(aa_t2,aa_t1,by=c("idlast","election_an","tour","code_unite"),all.x=T) qq2_idlast <- table(1-(is.na(aa_t2t1$match_t1))) qq3_idlast <- table(aa_t2t1$election_an,is.na(aa_t2t1$match_t1)) qq2_idlast qq3_idlast qq2_idlast <- data.frame(qq2_idlast) colnames(qq2_idlast) <- c("Var","Freq") qq2_idlast$id <- "idlast" qq3_idlast <- data.frame(qq3_idlast) colnames(qq3_idlast) <- c("Var1","Var2","Freq") qq3_idlast$id <- "idlast" qq2 <- rbind.fill(qq2,qq2_idlast) qq3 <- rbind.fill(qq3,qq3_idlast) #Merge aa[,`:=` (id_election_candidat_t1=max((tour==1 & nb_tour==2)*id_election_candidat)),by=paste(id_personne_long,election_an,code_unite,date_debut_ministere)] aa$id_election_candidat_t1 <- ifelse(aa$tour!=2 | aa$id_election_candidat_t1 %in% 0,NA,aa$id_election_candidat_t1) aa[,`:=` (id_election_candidat_t2b=max((tour==2)*id_election_candidat)),by=paste(id_personne_long,election_an,code_unite,date_debut_ministere)] table(is.na(aa$id_election_candidat_t2b),aa$tour) table(aa$id_election_candidat_t2b==0,aa$tour) table(is.na(aa$id_election_candidat_t2)) aa$id_election_candidat_t2b <- ifelse(aa$tour==1 & aa$id_election_candidat_t2b != 0,aa$id_election_candidat_t2b,NA) aa$id_election_candidat_t2 <- ifelse(is.na(aa$id_election_candidat_t2), aa$id_election_candidat_t2b, aa$id_election_candidat_t2) table(is.na(aa$id_election_candidat_t2)) aa$elu_t1 <- NULL ss_t2b <- aa[aa$tour==2 & is.na(aa$id_election_candidat_t1)==F,] nrow(ss_t2b) ss_t2b$id_personne_long <- NULL colnames(ss_t2b) ss_t2c <- rbind.fill(ss_t2,ss_t2b) colnames(ss_t2c) ## Appending the two rounds ##### # Second round checks # Check duplicates ss_t2d <- merge(ss_t2c,aa[aa$tour == 1 & aa$nb_tour==2,c("id_election_candidat_t2","id_personne_long")], by.x="id_election_candidat", by.y="id_election_candidat_t2",all.x=T) table(is.na(ss_t2d$id_personne_long)) aa <- aa[(aa$tour==2 & is.na(aa$id_election_candidat_t1)==F)==F,] nrow(aa) nrow(aa)+nrow(ss_t2d) ss_t2d$t1b <- NULL aa <- rbind.fill(aa,ss_t2d) aa <- data.table(aa) nrow(aa) rm(ss_t2b,ss_t2c,ss_t2d) str(aa) table(is.na(aa$id_personne_long)) length(table(aa$id_personne_long)) colnames(aa) table(aa$tour,useNA = "ifany") aa_t1 <- aa[aa$tour==1 & aa$nb_tour==2,c("id_personne_long","tour","election_an","code_unite","id_election_candidat_t2")] aa_t1$tour <- 2 aa_t1$match_t1 <- 1 aa_t2 <- aa[aa$tour==2 ,c("id_personne_long","tour","election_an","code_unite","id_election_candidat_t1")] aa_t2t1 <- merge(aa_t2,aa_t1,by=c("id_personne_long","election_an","code_unite","tour"),all.x=T) qq2_idpers <- table(1-(is.na(aa_t2t1$match_t1))) qq3_idpers <- table(aa_t2t1$election_an,is.na(aa_t2t1$match_t1)) qq2_idpers qq2_idpers <- data.frame(qq2_idpers) colnames(qq2_idpers) <- c("Var","Freq") qq2_idpers$id <- "idpers" qq3_idpers <- data.frame(qq3_idpers) colnames(qq3_idpers) <- c("Var1","Var2","Freq") qq3_idpers$id <- "idpers" qq2 <- rbind.fill(qq2,qq2_idpers) qq3 <- rbind.fill(qq3,qq3_idpers) aa$election <- aa$election_bak aa$election_an <- aa$election_an_bak aa$election_an_tour <- aa$election_an_tour_bak # We redo some t1 t2 variables variables aa[,`:=` (elu_t1=max((tour==1)*elu_ce_tour)),by=paste(id_personne_long,election_an,date_debut_ministere)] aa[,`:=` (elu_t2=max((tour==2)*elu_ce_tour)),by=paste(id_personne_long,election_an,date_debut_ministere)] table(aa$elu_t2,aa$elu_t1,useNA="ifany") aa$elu_t2 <- ifelse(aa$nb_tour ==1 ,NA,aa$elu_t2) table(aa$elu_t2,aa$tour) aa$elu_in_fine <- ifelse(aa$tour==9,NA,( aa$elu_t1 %in% 1 | aa$elu_t2 %in% 1 | aa$elu_ce_tour %in% 1) *1) table(aa$tour,aa$elu_in_fine,useNA = "ifany") table(aa$tour,aa$elu_t1,useNA = "ifany") table(aa$tour,aa$elu_t2,useNA = "ifany") table(aa$election_an_tour,aa$elu_in_fine,useNA = "ifany") aa$id_personne <- 10^7 + as.numeric(as.factor(aa$id_personne_long)) qq4_fid <- length(table(aa$id_personne)) print(qq4_fid) qq4_fid <- data.frame(qq4_fid) colnames(qq4_fid) <- c("n") qq4_fid$id <- "id_personne" qq4 <- rbind(qq4,qq4_fid) gc() # Selection of variables bb <- aa [,list(id_election_candidat, id_election_candidat_long, id_personne, id_personne_long, id_fichier, election, scrutin, date, annee, nb_tour, tour, election_an, election_an_tour, code_circo_europ, code_region_1982, code_region_2015, code_departement, code_circo_leg, code_canton, code_commune, code_commune_section, code_unite, libelle_unite, suppleant, id_nom_liste, nom, prenom, nom_std, prenom_std, nom_prenom_std, nom_prenom_std_cor, date_naissance, date_naissance_cor, sexe, profession, code_csp, code_csp_std, sortant, nuance, inscrits, votants, abstentions, exprimes, voix, intitule_liste, sexe_tete_liste, nom_tete_liste, prenom_tete_liste, rang_liste, nb_liste, max_rang_liste, sieges_liste, elu_ce_tour, elu_t1, elu_t2, elu_in_fine, id_election_candidat_t1, id_election_candidat_t2, date_debut_ministere)] bb <- bb[order(bb$id_personne,bb$date),] rownames(bb) <- NULL ## Final Export #### str(bb) bb <- as.data.frame(bb) str(bb) saveRDS(bb,"base_candidatEs_id.rds") # write.csv(bb,"base_candidatEs_id.csv",na='', row.names=FALSE,fileEncoding = "UTF-8") ## Quality statistics #### write.csv(qq1,"qq1.csv") write.csv(qq2,"qq2.csv") write.csv(qq3,"qq3.csv") write.csv(qq4,"qq4.csv") write.csv(qq5,"qq5.csv") write.csv(qq6,"qq6.csv") table(bb$election_an_tour[bb$tour==2], is.na(bb$id_election_candidat_t1)[bb$tour==2]) addmargins(table(bb$election[bb$tour==2], is.na(bb$id_election_candidat_t1[bb$tour==2]))) #-------------------------------# # Cleaning #### #-------------------------------# rm(list=ls()) gc() #-------------------------------# # End #### #-------------------------------#