#__________________________________________
# Library and upload  ####
#__________________________________________

# install.packages("RecordLinkage")
# install.packages("plyr")
# install.packages("stringi")
# install.packages("data.table")

library(RecordLinkage)
library("plyr",include.only = "rbind.fill")
library("dplyr")
library("stringi")
library("stringr")
library("data.table")

qq1 <- NULL
qq2 <- NULL
qq3 <- NULL
qq4 <- NULL
qq5 <- NULL
qq6 <- NULL

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

aa <- readRDS("base_candidatEs_public.rds")

# Export base candidats ####
# aa <- aa[,-which(names(aa) %in% c("nom_std","prenom_std","nom_prenom_std","epouse_std","nee_std","nom_dit_std","prenom_dit_std","scrutin_det"))]
# str(aa)
# saveRDS(aa,"base_candidats_public.rds")
# write.csv(aa,"base_candidats_public.csv",row.names=F,na='')

mycol <- colnames(aa)
mycol2 <- list(mycol)
str(mycol2)
str(aa)
nrow(aa)

aa <- data.table(aa)

aa[,c("id_election_candidat","id_personne","id_personne_long"):=NULL]
# aa <- aa[,..mycol]

aa$id_election_candidat_long <- as.double(aa$id_election_candidat_long)
str(aa)

aa$count <- 1

#_______________________________________________________#
# Standardization of names (optional) ####
#_______________________________________________________#

# Switch to TRUE to rerun name standardization
nom_prenom_std <- TRUE

# nom_prenom_std <- TRUE
if (nom_prenom_std ==TRUE){
  
  # aa$nom_prenom_std_bak <- aa$nom_prenom_std
  # aa$nom_prenom_std <- aa$nom_prenom_std_bak
  # aa$nom_prenom_std <- NULL
  
  ## Cleaning Nom ####
  aa$nom_std <- str_to_lower(aa$nom)
  aa$nom_std <- stri_trans_general(aa$nom_std,id = "Latin-ASCII")
  aa$nom_std <- str_replace(aa$nom_std, "\\-", " ")
  aa$nom_std <- str_replace(aa$nom_std, "\\.", " ")
  aa$nom_std <- str_replace(aa$nom_std, "'", " ")
  aa$nom_std <- str_replace(aa$nom_std, "\\?", " ")
  aa$nom_std <- str_replace(aa$nom_std, ";", " ")
  aa$nom_std <- str_replace(aa$nom_std, ",", " ")
  aa$nom_std <- str_replace(aa$nom_std, "'", " ")

  ## Delete spouse name in parenthesis or ep. or epouse ####
  aa$nom_std <- sub("\\(.*\\)", "", aa$nom_std)
  aa$epouse_std  <- ifelse(regexpr(" ep ",aa$nom_std) > 0,
                           substr(aa$nom_std,regexpr(" ep ",aa$nom_std)+4,nchar(aa$nom_std)),
                           NA)
  aa$nom_std <- ifelse(regexpr(" ep ",aa$nom_std) > 0,
                       substr(aa$nom_std,1,regexpr(" ep ",aa$nom_std)),
                       aa$nom_std)
  aa$epouse_std  <- ifelse(regexpr(" epouse ",aa$nom_std) > 0,
                           substr(aa$nom_std,regexpr(" epouse ",aa$nom_std)+8,
                                  nchar(aa$nom_std)),aa$epouse_std)
  aa$nom_std <- ifelse(regexpr(" epouse ",aa$nom_std) > 0,
                       substr(aa$nom_std,1,regexpr(" epouse ",aa$nom_std)),
                       aa$nom_std)

  ## Dit ####
  aa$nom_dit_std  <- ifelse(regexpr(" dit ",aa$nom_std) > 0,
                            substr(aa$nom_std,regexpr(" dit ",aa$nom_std)+5,nchar(aa$nom_std)),
                            NA)
  aa$nom_std <- ifelse(regexpr(" dit ",aa$nom_std) > 0,
                       substr(aa$nom_std,1,regexpr(" dit ",aa$nom_std)),
                       aa$nom_std)
  aa$nom_dit_std  <- ifelse(regexpr(" dite ",aa$nom_std) > 0,
                            substr(aa$nom_std,regexpr(" dite ",aa$nom_std)+6,nchar(aa$nom_std)),
                            aa$nom_dit_std)
  aa$nom_std <- ifelse(regexpr(" dite ",aa$nom_std) > 0,
                       substr(aa$nom_std,1,regexpr(" dite ",aa$nom_std)),
                       aa$nom_std)
  aa$nom_std <- sub("<.*<", "", aa$nom_std)
  aa$nom_std <- str_replace(aa$nom_std, "<", " ")

  ## Nee ####
  aa$nee_std  <- ifelse(regexpr(" nee ",aa$nom_std) > 0,
                        substr(aa$nom_std,regexpr(" nee ",aa$nom_std)+5,
                               nchar(aa$nom_std)),NA)
  
  aa$nom_std <- ifelse(regexpr(" nee ",aa$nom_std) > 0,
                       substr(aa$nom_std,1,regexpr(" nee ",aa$nom_std)),
                       aa$nom_std)

  ## Removing multiple white spaces ####
  aa$nom_std <- str_squish(aa$nom_std)

  ## Triming ####
  aa$nom_std <- str_trim(aa$nom_std)

  ## Cleaning Prenom ####
  aa$prenom_std <- str_to_lower(aa$prenom)
  aa$prenom_std <- stri_trans_general(aa$prenom_std,id = "Latin-ASCII")

  aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "j\\.", "jean "),aa$prenom_std)
  aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "f\\.", "francois "),aa$prenom_std)
  aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "m\\.", "marc "),aa$prenom_std)
  aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "p\\.", "pierre "),aa$prenom_std)
  aa$prenom_std <- ifelse(aa$sexe %in% "M",str_replace(aa$prenom_std, "y\\.", "yves "),aa$prenom_std)
  aa$prenom_std <- ifelse(aa$sexe %in% "F",str_replace(aa$prenom_std, "a\\.", "anne "),aa$prenom_std)
  aa$prenom_std <- ifelse(aa$sexe %in% "F",str_replace(aa$prenom_std, "m\\.", "marie "),aa$prenom_std)

  aa$prenom_std <- str_replace(aa$prenom_std, "\\-", " ")
  aa$prenom_std <- str_replace(aa$prenom_std, "\\.", " ")
  aa$prenom_std <- str_replace(aa$prenom_std, "\\?", " ")
  aa$prenom_std <- str_replace(aa$prenom_std, ";", " ")
  aa$prenom_std <- str_replace(aa$prenom_std, ",", " ")
  aa$prenom_std <- str_replace(aa$prenom_std, "'", " ")

  ## Prenom Dit ####
  aa$prenom_dit_std  <- ifelse(regexpr(" dit ",aa$prenom_std) > 0,
                               substr(aa$prenom_std,regexpr(" dit ",aa$prenom_std)+5,
                                      nchar(aa$prenom_std)),NA)
  aa$prenom_std <- ifelse(regexpr(" dit ",aa$prenom_std) > 0,
                          substr(aa$prenom_std,1,regexpr(" dit ",aa$prenom_std)),
                          aa$prenom_std)
  aa$prenom_dit_std  <- ifelse(regexpr(" dite ",aa$prenom_std) > 0,
                               substr(aa$prenom_std,regexpr(" dite ",aa$prenom_std)+6,nchar(aa$prenom_std)),
                               aa$prenom_dit_std)
  aa$prenom_std <- ifelse(regexpr(" dite ",aa$prenom_std) > 0,
                          substr(aa$prenom_std,1,regexpr(" dite ",aa$prenom_std)),
                          aa$prenom_std)
  
  
  aa$prenom_std <- sub("<.*<", "", aa$prenom_std)
  aa$prenom_std <- str_replace(aa$prenom_std, "<", " ")

  aa$prenom_std <- sub("\\(.*\\)", "", aa$prenom_std) #Delete other firstname in parenthesis

  aa$prenom_std <- str_squish(aa$prenom_std)
  aa$prenom_std <- str_trim(aa$prenom_std)

  ## nom_prenom_std ####
  aa$nom_prenom_std <- str_trim(paste(aa$nom_std,aa$prenom_std,sep=" "))
  
  ## Error name duplicate in the cantonale 1998 ####

  aa$nb_dup_dep_election <- ave(aa$count,
                                paste(aa$nom_prenom_std,aa$code_departement,aa$election_an_tour),FUN=sum)
  
  table(aa$election_an,aa$nb_dup_dep_election)
  
  aa$nom_prenom_std <- ifelse(aa$nb_dup_dep_election > 2 & aa$election_an=="cantonale 1998",
                              paste(aa$nom_prenom_std,"Erreur Nom",aa$id_election_candidat_long),aa$nom_prenom_std)
  
  aa$nb_dup_dep_election <- NULL

}


aa <- aa[order(aa$nom_prenom_std,
               -is.na(aa$date_naissance)*1,
               -aa$date_naissance,
               aa$date,
               -aa$election),]

aa$id_election_candidat <- as.double(cumsum(aa$count) + 10^7)

rownames(aa) <- NULL

#_____________________________________________________________#
# Matching the two rounds of different elections ####
#_____________________________________________________________#



aa$election_bak <- aa$election
aa$election_an_bak <- aa$election_an
aa$election_an_tour_bak <- aa$election_an_tour

aa$election[aa$election %in% c("municipale (petites communes)","maire")] <- "municipale"
aa$election_an <- paste(aa$election, aa$annee)
aa$election_an_tour <- paste(aa$election,aa$annee, aa$tour)



## Rounds matching function ####
aa$code_unite_election_an <- paste(aa$code_unite,aa$election_an)
gc()


rnd_match <- function (idin="",idout="") {
  
  aa[,which(names(aa) %in% c(idout))] <- NULL
  
  aa$my_fid_1 <- aa[[idin]]
  
  # New id as the max of id by unit, with creation of a new
  aa[,`:=` (my_fid=max(my_fid_1)),by=paste(key,code_unite_election_an)]
  
  # Rematching the ids
  aa[,`:=` (my_fid1=max(my_fid)),by=my_fid_1]
  
  # Number of initial matches
  print("Number of initial matches")
  print(table(1-(aa$my_fid==aa$my_fid_1)))
  
  # Number of rematches after looping on previous ids
  print("Number of rematches after looping on previous ids")
  print(table(1-(aa$my_fid1==aa$my_fid_1)))
  
  aa$my_fid <- aa$my_fid1
  
  # Check if identification creates duplicates
  aa[,`:=` (nb_pb=sum(count)),by=paste(my_fid,election_an_tour,date_debut_ministere)]
  aa[,`:=` (max_nb_pb=max(nb_pb)),by=paste(my_fid)]
  
  print("Number of identifications with different application for the same election at the same date (pb >1)")
  print(table(aa$max_nb_pb))
  
  # Already matched
  aa[,`:=` (match_unit_1=sum(count)),by=paste(my_fid_1,code_unite_election_an)]
  
  # Not matched
  aa[,`:=` (match_unit=sum(count)),by=paste(my_fid,code_unite_election_an)]
  
  aa$my_fid <- ifelse(aa$max_nb_pb>1 | aa$match_unit_1==2 | aa$match_unit==1,aa$my_fid_1,aa$my_fid)
  
  # View(aa[aa$max_nb_pb>1,c("nom_prenom_std","my_fid","election_an_tour","date_debut_ministere","code_unite")])
  
  # Correction of duplicated identification by date birth
  aa[,`:=` (nb_my_fid_1_nais=sum(count)),by=paste(my_fid,date_naissance)]
  aa[,`:=` (nb_my_fid_1_nais_dif=sum(ifelse(is.na(date_naissance)==T,0,1/nb_my_fid_1_nais))),by=my_fid]
  
  print("Number of identification with different birth dates (pb >1)")
  print(table(aa$nb_my_fid_1_nais_dif))
  
  # Final Identification
  aa$my_fid <- ifelse(aa$nb_my_fid_1_nais_dif>1,aa$my_fid_1,aa$my_fid)
  
  #Quality check
  qq1_fid <- table(1-(aa$my_fid != (aa$my_fid_1)) )
  print("Number of changes in Ids")
  print(qq1_fid)
  
  
  aa$my_fid <- aa$my_fid
  
  aa_t1 <- aa[aa$tour==1 & aa$nb_tour==2,c("my_fid","tour","election_an")]
  aa_t1$tour  <- 2
  aa_t1$match_t1  <- 1
  aa_t2 <- aa[aa$tour==2,c("my_fid","tour","election_an")]
  aa_t2t1 <- merge(aa_t2,aa_t1,by=c("my_fid","election_an","tour"),all.x=T)
  qq2_fid <- table(1-is.na(aa_t2t1$match_t1))
  qq3_fid <- table(aa_t2t1$election_an,1-is.na(aa_t2t1$match_t1))
  
  print("Number of second rounds foudn in first round")
  print(qq2)
  
  print("Number of second rounds foudn in first round by election")
  print(qq3_fid)
  
  
  nrow(aa_t2)
  nrow(aa_t2t1)
  
  
  aa[,c("nb_my_fid_1_nais","nb_pb","max_nb_pb","nb_my_fid_1_nais_dif","match_unit","match_unit_1"):=NULL]
  
  qq1_fid <- data.frame(qq1_fid)
  colnames(qq1_fid) <- c("Var","Freq")
  qq1_fid$id <- idout
  
  qq2_fid <- data.frame(qq2_fid)
  colnames(qq2_fid) <- c("Var","Freq")
  qq2_fid$id <- idout
  
  
  qq3_fid <- data.frame(qq3_fid)
  colnames(qq3_fid) <- c("Var1","Var2","Freq")
  qq3_fid$id <- idout
  
  qq1 <<- rbind.fill(qq1,qq1_fid)
  qq2 <<- rbind.fill(qq2,qq2_fid)
  qq3 <<- rbind.fill(qq3,qq3_fid)
  
  qq4_fid <- length(table(aa$my_fid))
  qq4_fid <- data.frame(qq4_fid)
  colnames(qq4_fid) <- c("n")
  qq4_fid$id <- idout
  qq4 <- rbind(qq4,qq4_fid)
  
  colnames(aa)[which(names(aa) %in% c("my_fid"))] <- idout
   
  
  aa[,which(names(aa) %in% c(paste(idin,"_1",sep="")))] <- NULL
  aa[,which(names(aa) %in% c(paste(idout,"1",sep="")))] <- NULL
  aa[,c("my_fid_1","my_fid1"):=NULL]
  
  aa <<- aa
  qq4 <<- qq4
  
  print("Number of ids")
  print(idout)
  print(qq4_fid)
}

#_____________________________________________________________
## 2 Chaining on code_unite_election_an & nom_prenom_std & date_naissance ####
#_____________________________________________________________

aa$key <- ifelse(is.na(aa$date_naissance)==F,aa$id_election_candidat,
                 paste(aa$nom_prenom_std,aa$date_naissance))
                 
rnd_match(idin="id_election_candidat",idout="my_id2a")
colnames(aa)


#_____________________________________________________________
## 2b Chaining on code_unite_election_an & nom_prenom_std ####
#_____________________________________________________________

# When there's no visible homonyms in any election in the commune
# and there's no more than one birth date in the commune
# we apply the same id to all nom_prenom_std

str(aa)
aa$key <- aa$nom_prenom_std
rnd_match(idin="my_id2a",idout="my_id2b")

#_____________________________________________________________
## 2c Chaining on code_unite_election_an, nom_std, sexe & date_naissance ####
#_____________________________________________________________

aa$key <- ifelse(is.na(aa$date_naissance)==F,aa$my_id2b,
                 paste(aa$nom_std,aa$sexe,aa$date_naissance))
rnd_match(idin="my_id2b",idout="my_id2c")

#_____________________________________________________________
## 2d Chaining on code_unite_election_an, nom_std, sexe ####
#____________________________________________________________

aa$key <- paste(aa$nom_std,aa$sexe)
rnd_match(idin="my_id2c",idout="my_id2d")

#____________________________________________________________
## 2e Chaining on code_unite_election_an, prenom_std, sexe, nuance, date_naissance,  ####
#_____________________________________________________________

aa$key <- ifelse(is.na(aa$date_naissance)==F,aa$my_id2d,
                 paste(aa$prenom_std,aa$sexe,aa$date_naissance,aa$nuance))
rnd_match(idin="my_id2d",idout="my_id2e")

#_____________________________________________________________
## 2f Chaining on code_unite_election_an, prenom_std, sexe, nuance ####
#_____________________________________________________________

aa$key <- paste(aa$prenom_std,aa$sexe,aa$nuance)
rnd_match(idin="my_id2e",idout="my_id2f")

#_____________________________________________________________
## 2g Chaining on code_unite_election_an, prenom_std, sexe, nom_tete_liste, date_naissance ####
#_____________________________________________________________

aa$key <- ifelse(is.na(aa$date_naissance)==F,aa$my_id2f,
                 paste(aa$prenom_std,aa$sexe,aa$date_naissance,tolower(aa$nom_tete_liste)))
rnd_match(idin="my_id2f",idout="my_id2g")

#_____________________________________________________________
## 2h Chaining on code_unite_election_an, prenom_std, sexe, nom_tete_liste ####
#_____________________________________________________________

aa$key <- paste(aa$prenom_std,aa$sexe,tolower(aa$nom_tete_liste))
rnd_match(idin="my_id2g",idout="my_id2h")

#_____________________________________________________________
## 2i Chaining on code_unite_election_an, prenom_std, sexe ####
#_____________________________________________________________

aa$key <- paste(aa$prenom_std,aa$sexe)
rnd_match(idin="my_id2h",idout="my_id2i")

#_____________________________________________________________
# Cleaning of round 1-2 matches ####
#_____________________________________________________________

# aa[,c("id_election_candidat_t2","name_cor_ok.x","nom_prenom_std_cor","nom_std_cor","prenom_std_cor","elu_t2","date_naissance_cor2","t2","t2.y","name_cor_ok.y"):=NULL]
str(aa)
gc()

aa$date_naissance_cor <- aa$date_naissance

aa_t2 <- aa[aa$tour==2,c("my_id2i","tour","election_an","id_election_candidat","nom_prenom_std","nom_std","prenom_std","elu_ce_tour","date_naissance_cor")]
aa_t2$tour <- 1
aa_t2$t2 <- 1
colnames(aa_t2)[4:9] <- c("id_election_candidat_t2","nom_prenom_std_cor","nom_std_cor","prenom_std_cor","elu_t2","date_naissance_cor2")
str(aa)
str(aa_t2)


aa <- merge(aa,aa_t2,by=c("my_id2i","election_an","tour"),all.x=T)
nrow(aa)

aa$nom_prenom_std_cor <- ifelse(is.na(aa$nom_prenom_std_cor)==T,aa$nom_prenom_std,aa$nom_prenom_std_cor)
aa$nom_std_cor <- ifelse(is.na(aa$nom_std_cor)==T,aa$nom_std,aa$nom_std_cor)
aa$prenom_std_cor <- ifelse(is.na(aa$prenom_std_cor)==T,aa$prenom_std,aa$prenom_std_cor)

## Name match correction cleaning ####
# We check if first names match do not lead to too distant last names
check_name <- aa[aa$nom_std_cor != aa$nom_std,c("my_id2i","election_an","code_unite","nom_std","nom_std_cor","sexe","nee_std","nom_dit_std")]
check_name$prox <- levenshteinSim(check_name$nom_std,check_name$nom_std_cor)
check_name$prox2 <- (substr(check_name$nom_std,1,3)==substr(check_name$nom_std_cor,1,3))*1
check_name$name_cor_ok <- (check_name$prox>0.5 | check_name$prox2==1)*1
table(check_name$name_cor_ok)

aa <- merge(aa,check_name[,c("my_id2i","name_cor_ok")],by="my_id2i",all.x=T)
table(aa$name_cor_ok)

aa$date_naissance_cor2 <- ifelse(aa$name_cor_ok  %in% 0,NA,aa$date_naissance_cor2)
aa$my_id2j <- ifelse(aa$name_cor_ok %in% 0,aa$my_id2d,aa$my_id2i)
aa$nom_prenom_std_cor <- ifelse(aa$name_cor_ok %in% 0,aa$nom_prenom_std,aa$nom_prenom_std_cor)
aa$nom_std_cor <- ifelse(aa$name_cor_ok %in% 0,aa$nom_std,aa$nom_std_cor)
aa$prenom_std_cor <- ifelse(aa$name_cor_ok %in% 0,aa$prenom_std,aa$prenom_std_cor)
aa$id_election_candidat_t2 <- ifelse(aa$name_cor_ok %in% 0,NA,aa$id_election_candidat_t2)
aa$t2 <- ifelse(aa$name_cor_ok %in% 0,NA,aa$t2)
aa$elu_t2 <- ifelse(aa$name_cor_ok %in% 0,NA,aa$elu_t2)


aa$date_naissance_cor <- ifelse(is.na(aa$date_naissance_cor)==T,aa$date_naissance_cor2,aa$date_naissance_cor)
aa$date_naissance_cor2 <- NULL


## Match t2 & t1 ####
aa$t2 <- ifelse(aa$tour==1 & aa$nb_tour==2 & is.na(aa$t2)==T,0,aa$t2)

table(aa$t2)

aa_t1 <- aa[aa$tour==1  & aa$nb_tour==2,c("my_id2j","tour","election_an")]
aa_t1$tour  <- 2
aa_t1$match_t1  <- 1
aa_t2 <- aa[aa$tour==2,c("my_id2j","tour","election_an")]
aa_t2t1 <- merge(aa_t2,aa_t1,by=c("my_id2j","election_an","tour"),all.x=T)
qq2_id2j <- table(1-(is.na(aa_t2t1$match_t1)))
qq3_id2j <- table(aa_t2t1$election_an,is.na(aa_t2t1$match_t1))
nrow(aa_t2)
nrow(aa_t2t1)


qq2_id2j <- data.frame(qq2_id2j)
colnames(qq2_id2j) <- c("Var","Freq")
qq2_id2j$id <- "id2j"

qq3_id2j <- data.frame(qq3_id2j)
colnames(qq3_id2j) <- c("Var1","Var2","Freq")
qq3_id2j$id <- "id2j"

qq2 <- rbind.fill(qq2,qq2_id2j)
qq3 <- rbind.fill(qq3,qq3_id2j)

qq4_fid <- length(table(aa$my_id2j))
print(qq4_fid)
qq4_fid <- data.frame(qq4_fid)
colnames(qq4_fid) <- c("n")
qq4_fid$id <- "my_id2j"
qq4 <- rbind(qq4,qq4_fid)


#------------------------------#
## The 2 rounds ####
#------------------------------#

aa_t1 <- aa[aa$tour==1  & aa$nb_tour==2,c("my_id2j","tour","election_an","id_election_candidat","elu_ce_tour")]
aa_t1$tour <- 2
aa_t1$t1 <- 1
colnames(aa_t1)[c(4:5)] <- c("id_election_candidat_t1","elu_t1")

aa <- merge(aa,aa_t1,by=c("my_id2j","election_an","tour"),all.x=T)
nrow(aa)


aa$t1 <- ifelse(aa$tour==2 & is.na(aa$t1)==T,0,aa$t1)
table(aa$t1)

ss_t2 <- aa[aa$tour==2 & aa$t1==1,]
nrow(ss_t2)

aa <- aa[(aa$tour==2 & aa$t1==1)==F,]
nrow(aa)
# nrow(bb)+nrow(ss_t2)
# rm(bb)

aa[,which(substr(names(aa),1,5) %in% "my_id"):= NULL]
ss_t2[,which(substr(names(ss_t2),1,5) %in% "my_id"):= NULL]
str(ss_t2)
nrow(ss_t2)
nrow(aa)
nrow(aa)+nrow(ss_t2)
rm(aa_t1,aa_t2,aa_t2t1)
gc()
rm(nn,nn_m)
str(aa)

# saveRDS(aa,"aa_tmp.RDS")
# saveRDS(ss_t2,"ss_t2_tmp.RDS")
# ss_t2 <- readRDS("ss_t2_tmp.RDS")
# aa <- readRDS("aa_tmp.RDS")

#_____________________________________________________________
# 3 Base chaining  ####
#_____________________________________________________________

aa <- aa[order(aa$id_election_candidat),]
rownames(aa) <- NULL
aa$count <- 1
str(aa)

set.seed(5555)
aa$epsilon <- round(runif(nrow(aa))/1000,10)


## Chaining function ####
# Not totally necessary
# But enables to prioritize id of already matched
# (Makes a small difference in the last rounds)
alpha <- 10^8
beta <- 10^10

# When there's no visible homonyms in any election in the unit
# and there's no more than one birth date in the unit
# we apply the same id to all key

id_match <- function (idin="",idout="") {
  
  aa[,which(names(aa) %in% c(idout))] <- NULL
  
  # Identification
  aa$my_fid_1 <- aa[[idin]]
  
  # Initial match
  aa[,`:=` (my_fid=max(alpha+my_fid_1)),by=key]
  # Relooping on previous ids
  aa[,`:=` (my_fid1=max(my_fid)),by=my_fid_1]
  
  # Change ids
  print("Initial id changes")
  print(table(1-(aa$my_fid==alpha+aa$my_fid_1)))
  
  print("Id changes after relooping on previous ids")
  print(table(1-(aa$my_fid1==alpha+aa$my_fid_1)))
  
  aa$my_fid <- aa$my_fid1
  
  # checking number of identifications
  aa[,`:=` (nb_id_1=sum(count+epsilon)),by=paste(my_fid_1)]
  aa[,`:=` (nb_id=sum(count+epsilon)),by=paste(my_fid)]
  
  print("Change in the number of obs to which an id is matched (!=0)")
  print(table(round(aa$nb_id-aa$nb_id_1)))
  
  # Correction of duplicated identification by election
  aa[,`:=` (nb_pb=sum(count)),by=paste(my_fid,election_an_tour,date_debut_ministere)]
  aa[,`:=` (max_nb_pb=max(nb_pb)),by=paste(my_fid)]
  qq5_fid <- table(aa$max_nb_pb)
  print("Number of identifications with different application for the same election at the same date (pb >1)")
  print(qq5_fid)
  
  aa$my_fid <- ifelse(aa$max_nb_pb>1 | (aa$nb_id-aa$nb_id_1)==0,aa$my_fid_1,aa$my_fid)
  
  # Correction of duplicated identification by date birth
  aa[,`:=` (nb_my_id_nais=sum(count)),by=paste(my_fid,date_naissance_cor)]
  aa[,`:=` (nb_my_id_nais_dif=sum(ifelse(is.na(date_naissance_cor)==T,0,1/nb_my_id_nais))),by=my_fid]
  qq6_fid <- table(aa$nb_my_id_nais_dif)
  print("Number of identification with different birth dates (pb >1)")
  print(qq6_fid)
  
  # Final identification
  aa$my_fid <- ifelse(aa$nb_my_id_nais_dif>1,aa$my_fid_1,aa$my_fid)
  
  # Quality of identification
  qq2_fid <- table(1- ((aa$my_fid!=aa$my_fid_1+alpha) & (aa$my_fid!=aa$my_fid_1)))
  print(qq2_fid)
  
  qq2_fid <- data.frame(qq2_fid)
  colnames(qq2_fid) <- c("Var","Freq")
  qq2_fid$id <- idout
  
  
  aa[,c("nb_my_id_nais","nb_pb","max_nb_pb","nb_my_id_nais_dif","nb_id","nb_id_1"):=NULL]
  
  qq4_fid <- length(table(aa$my_fid))
  
  qq4_fid <- data.frame(qq4_fid)
  colnames(qq4_fid) <- c("n")
  qq4_fid$id <- idout
  qq4 <<- rbind(qq4,qq4_fid)
  
  qq5_fid <- data.frame(qq5_fid)
  colnames(qq5_fid) <- c("Var","Freq")
  qq5_fid$id <- idout
  qq5 <<- rbind(qq5,qq5_fid) 
  
  qq6_fid <- data.frame(qq6_fid)
  colnames(qq6_fid) <- c("Var","Freq")
  qq6_fid$id <- idout
  qq6 <<- rbind(qq6,qq6_fid)
  
  aa$my_fid <- aa$my_fid + beta
  
  names(aa) <- gsub("my_fid",idout,names(aa))

  aa[,which(names(aa) %in% c(paste(idin,"_1",sep="")))] <- NULL
  aa[,which(names(aa) %in% c(paste(idout,"1",sep="")))] <- NULL
  aa[,c("my_fid_1","my_fid1"):=NULL]
  
  aa <<- aa
  
  
  print("Number of ids")
  print(idout)
  print(qq4_fid)
}

#_____________________________________________________________
## 3a Chaining on nom_prenom_std & date_naissance_cor ####
#_____________________________________________________________

aa$key <- paste(ifelse(is.na(aa$date_naissance_cor)==T,aa$id_election_candidat,aa$nom_prenom_std),aa$date_naissance_cor)
id_match(idin="id_election_candidat",idout="my_id3a")

#_____________________________________________________________
## 3b  Chaining on nom_prenom_std_cor & date_naissance_cor  ####
#_____________________________________________________________

aa$key <- paste(ifelse(is.na(aa$date_naissance_cor)==T,aa$my_id3a,aa$nom_prenom_std_cor),aa$date_naissance_cor)
id_match(idin="my_id3a",idout="my_id3b")
qq4
str(aa)

#_____________________________________________________________
## 3c Chaining on nom_prenom_std ####
#_____________________________________________________________

aa$key <- aa$nom_prenom_std
id_match(idin="my_id3b",idout="my_id3c")

#_____________________________________________________________
# 4 Geographical Chaining ####
#_____________________________________________________________

#_____________________________________________________________
## 4a Chaining on code_unite & nom_prenom_std ####
#_____________________________________________________________


aa$key <- paste(aa$nom_prenom_std,aa$code_unite)
id_match(idin="my_id3c",idout="my_id4a")

#_____________________________________________________________
## 4b Chaining on code_commune & nom_prenom_std ####
#_____________________________________________________________

aa$code_commune_bis <- ifelse(is.na(aa$code_commune),aa$id_election_candidat,aa$code_commune)
aa$key <- paste(aa$nom_prenom_std,aa$code_commune_bis)
id_match(idin="my_id4a",idout="my_id4b")
aa$code_commune_bis <- NULL

#_____________________________________________________________
## 4cd Chaining on code_departement & nom_prenom_std ####
#_____________________________________________________________

aa$code_departement_bis <- ifelse(is.na(aa$code_departement),aa$id_election_candidat,aa$code_departement)
aa$key <- paste(aa$nom_prenom_std,aa$election, aa$code_departement_bis)
id_match(idin="my_id4b",idout="my_id4c")

aa$key <- paste(aa$nom_prenom_std,aa$code_departement_bis)
id_match(idin="my_id4c",idout="my_id4d")

aa$code_departement_bis <- NULL
qq4

#_____________________________________________________________
## 4ef Chaining on code_region_2015 & nom_prenom_std ####
#_____________________________________________________________

aa$code_region_2015_bis <- ifelse(is.na(aa$code_region_2015),aa$id_election_candidat,aa$code_region_2015)

aa$key <- paste(aa$nom_prenom_std,aa$election,aa$code_departement_bis)
id_match(idin="my_id4d",idout="my_id4e")

aa$key <- paste(aa$nom_prenom_std,aa$code_departement_bis)
id_match(idin="my_id4e",idout="my_id4f")

aa$code_region_2015_bis <- NULL


#_____________________________________________________________
## 4gh Chaining on code_circo_europ & nom_prenom_std ####
#_____________________________________________________________

# When there's no visible homonyms in any election in the european circ.
# and there's no more than one birth date in the european circ.
# we apply the same id to all nom_prenom_std

aa$code_circo_europ_bis <- ifelse(is.na(aa$code_circo_europ),aa$id_election_candidat,aa$code_circo_europ)

aa$key <- paste(aa$nom_prenom_std,aa$election,aa$code_circo_europ_bis)
id_match(idin="my_id4f",idout="my_id4g")


aa$key <- paste(aa$nom_prenom_std,aa$code_circo_europ_bis)
id_match(idin="my_id4g",idout="my_id4h")


aa$code_circo_europ_bis <- NULL
qq4

#_____________________________________________________________
# 5 By election  ####
#_____________________________________________________________


aa$my_id5a <- NULL
aa$key <- paste(aa$nom_prenom_std,aa$election_bak)
id_match(idin="my_id4h",idout="my_id5a")

aa$election_nationale <- ifelse(aa$election %in% c("europeenne","legislative","ministere","presidentielle","senatoriale"),
                          "nationale",
                          ifelse(aa$election %in% c("maire"),
                                 "municipale",
                                 aa$election_bak))

aa$my_id5b <- NULL
aa$key <- paste(aa$nom_prenom_std,aa$election_nationale)
id_match(idin="my_id5a",idout="my_id5b")
qq4
str(aa)

#_____________________________________________________________
# 6 Mild Chaining on Different spellings  ####
#_____________________________________________________________


#_____________________________________________________________
## 6a 2nd round name correction:  nom_prenom_std_cor  ####
#_____________________________________________________________

nom_prenom_cor <- aa[aa$nom_prenom_std_cor != aa$nom_prenom_std,c("nom_prenom_std","nom_prenom_std_cor","prenom_std_cor")]
nom_prenom_cor <- unique(nom_prenom_cor)
colnames(nom_prenom_cor) <- c("nom_prenom_std","nom_prenom_std_cor2","prenom_std_cor2")
str(nom_prenom_cor)
nrow(aa)
aa$nom_prenom_std_cor2 <- NULL
aa <- merge(aa,nom_prenom_cor,by="nom_prenom_std", all.x= T)
nrow(aa)
table(is.na(aa$nom_prenom_std_cor2))

aa$nom_prenom_std_cor2 <- ifelse(is.na(aa$nom_prenom_std_cor2),
                                 aa$nom_prenom_std_cor,
                                 aa$nom_prenom_std_cor2)

aa$prenom_std_cor2 <- ifelse(is.na(aa$prenom_std_cor2),
                                 aa$prenom_std_cor,
                                 aa$prenom_std_cor2)



aa$key <- aa$nom_prenom_std_cor2

id_match(idin="my_id5b",idout="my_id6a")
nrow(aa)


#_____________________________________________________________
## 6b Chaining on nom_std prenom_1_std_cor2 ####
#_____________________________________________________________
aa$ok <- NULL

aa$prenom_1_std_cor2 <- gsub("([A-Za-z]+).*", "\\1", aa$prenom_std_cor2)
aa$nom_prenom_1_std_cor2 <- paste(aa$nom_std,aa$prenom_1_std_cor2)
aa$nbprenom <- lengths(gregexpr("\\W+", aa$prenom_std_cor2))

aa$key <- aa$nom_prenom_1_std_cor2
id_match(idin="my_id6a",idout="my_id6b")

# Checking if multiple prenom2 among Affected ids
aa[,`:=` (nb_chg=sum((my_id6b != my_id6a + alpha) & (my_id6b != my_id6a))), by=paste(my_id6b)]

# pp_m : match first names database 
pp_m <- aa[nb_chg>0,c("my_id6b","prenom_1_std_cor2","prenom_std_cor2","nom_prenom_1_std_cor2","nom_prenom_std_cor2")]
pp_m$prenom2 <- sapply(1:nrow(pp_m), function(x) gsub(pp_m$prenom_1_std_cor2[x], "", pp_m$prenom_std_cor2[x]))
pp_m[,`:=` (nb_prenom2=sum(my_id6b>0)),by=paste(my_id6b,prenom2)]
pp_m[,`:=` (nb_prenom_vide=sum(prenom2 %in% c(""," ",NA))),by=paste(my_id6b)]
pp_m[,`:=` (nb_id=sum(my_id6b>0)),by=paste(my_id6b)]

# Ok are correct matches
pp_m[,`:=` (ok=max(ifelse(prenom2 %in% c(""," ",NA)==F & (nb_prenom_vide + nb_prenom2==nb_id),1,0))),by=paste(my_id6b)]

aa$ok <- NULL
aa <- merge(aa,unique(pp_m[,c("my_id6b","ok")]),by="my_id6b",all.x = T)
aa$my_id6b <- ifelse(aa$ok %in% 0,aa$my_id6a+beta,aa$my_id6b)

qq4_fid <- length(table(aa$my_id6b))
print(qq4_fid)
qq4_fid <- data.frame(qq4_fid)
colnames(qq4_fid) <- c("n")
qq4_fid$id <- "my_id6b"
qq4 <- rbind(qq4,qq4_fid)

# Correction of duplicated identification by election
aa[,`:=` (nb_pb=sum(count)),by=paste(my_id6b,election_an_tour,date_debut_ministere)]
aa[,`:=` (max_nb_pb=max(nb_pb)),by=paste(my_id6b)]
qq5_fid <- table(aa$max_nb_pb)

print("Number of identifications with different application for the same election at the same date (pb >1)")
print(qq5_fid)


# Correction of duplicated identification by date birth
aa[,`:=` (nb_my_id_nais=sum(count)),by=paste(my_id6b,date_naissance_cor)]
aa[,`:=` (nb_my_id_nais_dif=sum(ifelse(is.na(date_naissance_cor)==T,0,1/nb_my_id_nais))),by=my_id6b]
qq6_fid <- table(aa$nb_my_id_nais_dif)

print("Number of identification with different birth dates (pb >1)")
print(qq6_fid)

qq5_fid <- data.frame(qq5_fid)
colnames(qq5_fid) <- c("Var","Freq")
qq5_fid$id <- "my_id6b"
qq5 <- rbind(qq5,qq5_fid) 

qq6_fid <- data.frame(qq6_fid)
colnames(qq6_fid) <- c("Var","Freq")
qq6_fid$id <- "my_id6b"
qq6 <- rbind(qq6,qq6_fid)

# 18 incorrect matches --> corrected by hand for 4d
# View(aa[aa$max_nb_pb>1 | nb_my_id_nais_dif>1,])
aa$idlast <- ifelse(aa$max_nb_pb>1 | aa$nb_my_id_nais_dif>1,aa$my_id4d,aa$my_id6b)


# Correction of duplicated identification by election
aa[,`:=` (nb_pb=sum(count)),by=paste(idlast,election_an_tour,date_debut_ministere)]
aa[,`:=` (max_nb_pb=max(nb_pb)),by=paste(idlast)]
qq5_fid <- table(aa$max_nb_pb)

print("Number of identifications with different application for the same election at the same date (pb >1)")
print(qq5_fid)


# Correction of duplicated identification by date birth
aa[,`:=` (nb_my_id_nais=sum(count)),by=paste(idlast,date_naissance_cor)]
aa[,`:=` (nb_my_id_nais_dif=sum(ifelse(is.na(date_naissance_cor)==T,0,1/nb_my_id_nais))),by=idlast]
qq6_fid <- table(aa$nb_my_id_nais_dif)
print("Number of identification with different birth dates (pb >1)")
print(qq6_fid)


qq4_fid <- length(table(aa$idlast))
print(qq4_fid)
qq4_fid <- data.frame(qq4_fid)
colnames(qq4_fid) <- c("n")
qq4_fid$id <- "idlast"
qq4 <- rbind(qq4,qq4_fid)


qq5_fid <- data.frame(qq5_fid)
colnames(qq5_fid) <- c("Var","Freq")
qq5_fid$id <- "idlast"
qq5 <- rbind(qq5,qq5_fid) 

qq6_fid <- data.frame(qq6_fid)
colnames(qq6_fid) <- c("Var","Freq")
qq6_fid$id <- "idlast"
qq6 <- rbind(qq6,qq6_fid)


# saveRDS(aa,"aa_tmp2.rds")

aa[,which(substr(names(aa),1,5) %in% "my_id"):= NULL]
aa[,c("nb_my_id_nais","nb_pb","max_nb_pb","nb_my_id_nais_dif","match_unit_1","match_unit","key","ok","nbprenom"):=NULL]
str(aa)

qq4 <- rbind(qq4,qq4_fid)
qq4


#--------------------------------------#
# Reappending, cleaning and save ####
#--------------------------------------#


## Final id personne ####
aa[,`:=` (id_personne_long=min(id_election_candidat_long)),by=paste(idlast)]
nrow(aa)


## More matches round 1 & 2 ####
#Checks
aa_t1 <- aa[aa$tour==1  & aa$nb_tour==2,c("idlast","tour","election_an","code_unite")]
aa_t1$tour  <- 2
aa_t1$match_t1  <- 1
aa_t2 <- aa[aa$tour==2,c("idlast","tour","election_an","code_unite")]
aa_t2t1 <- merge(aa_t2,aa_t1,by=c("idlast","election_an","tour","code_unite"),all.x=T)
qq2_idlast <- table(1-(is.na(aa_t2t1$match_t1)))
qq3_idlast <- table(aa_t2t1$election_an,is.na(aa_t2t1$match_t1))
qq2_idlast
qq3_idlast

qq2_idlast <- data.frame(qq2_idlast)
colnames(qq2_idlast) <- c("Var","Freq")
qq2_idlast$id <- "idlast"

qq3_idlast <- data.frame(qq3_idlast)
colnames(qq3_idlast) <- c("Var1","Var2","Freq")
qq3_idlast$id <- "idlast"

qq2 <- rbind.fill(qq2,qq2_idlast)
qq3 <- rbind.fill(qq3,qq3_idlast)


#Merge
aa[,`:=` (id_election_candidat_t1=max((tour==1  & nb_tour==2)*id_election_candidat)),by=paste(id_personne_long,election_an,code_unite,date_debut_ministere)]
aa$id_election_candidat_t1 <- ifelse(aa$tour!=2  | aa$id_election_candidat_t1 %in% 0,NA,aa$id_election_candidat_t1)

aa[,`:=` (id_election_candidat_t2b=max((tour==2)*id_election_candidat)),by=paste(id_personne_long,election_an,code_unite,date_debut_ministere)]
table(is.na(aa$id_election_candidat_t2b),aa$tour)
table(aa$id_election_candidat_t2b==0,aa$tour)

table(is.na(aa$id_election_candidat_t2))
aa$id_election_candidat_t2b <- ifelse(aa$tour==1 & aa$id_election_candidat_t2b != 0,aa$id_election_candidat_t2b,NA)
aa$id_election_candidat_t2 <- ifelse(is.na(aa$id_election_candidat_t2),
                                     aa$id_election_candidat_t2b,
                                     aa$id_election_candidat_t2)
table(is.na(aa$id_election_candidat_t2))
aa$elu_t1 <- NULL

ss_t2b <- aa[aa$tour==2 & is.na(aa$id_election_candidat_t1)==F,]
nrow(ss_t2b)

ss_t2b$id_personne_long <- NULL

colnames(ss_t2b)

ss_t2c <- rbind.fill(ss_t2,ss_t2b)
colnames(ss_t2c)


## Appending the two rounds #####

# Second round checks
# Check duplicates
ss_t2d <- merge(ss_t2c,aa[aa$tour == 1 &  aa$nb_tour==2,c("id_election_candidat_t2","id_personne_long")],
                by.x="id_election_candidat",
                by.y="id_election_candidat_t2",all.x=T)
table(is.na(ss_t2d$id_personne_long))

aa <- aa[(aa$tour==2 & is.na(aa$id_election_candidat_t1)==F)==F,]
nrow(aa)
nrow(aa)+nrow(ss_t2d)


ss_t2d$t1b <- NULL

aa <- rbind.fill(aa,ss_t2d)
aa <- data.table(aa)
nrow(aa)
rm(ss_t2b,ss_t2c,ss_t2d)
str(aa)

table(is.na(aa$id_personne_long))
length(table(aa$id_personne_long))
colnames(aa)
table(aa$tour,useNA = "ifany")

aa_t1 <- aa[aa$tour==1  & aa$nb_tour==2,c("id_personne_long","tour","election_an","code_unite","id_election_candidat_t2")]
aa_t1$tour  <- 2
aa_t1$match_t1  <- 1
aa_t2 <- aa[aa$tour==2 ,c("id_personne_long","tour","election_an","code_unite","id_election_candidat_t1")]
aa_t2t1 <- merge(aa_t2,aa_t1,by=c("id_personne_long","election_an","code_unite","tour"),all.x=T)
qq2_idpers <- table(1-(is.na(aa_t2t1$match_t1)))
qq3_idpers <- table(aa_t2t1$election_an,is.na(aa_t2t1$match_t1))
qq2_idpers
qq2_idpers <- data.frame(qq2_idpers)
colnames(qq2_idpers) <- c("Var","Freq")
qq2_idpers$id <- "idpers"

qq3_idpers <- data.frame(qq3_idpers)
colnames(qq3_idpers) <- c("Var1","Var2","Freq")
qq3_idpers$id <- "idpers"

qq2 <- rbind.fill(qq2,qq2_idpers)
qq3 <- rbind.fill(qq3,qq3_idpers)


aa$election <- aa$election_bak
aa$election_an <- aa$election_an_bak
aa$election_an_tour <- aa$election_an_tour_bak


# We redo some  t1 t2 variables  variables
aa[,`:=` (elu_t1=max((tour==1)*elu_ce_tour)),by=paste(id_personne_long,election_an,date_debut_ministere)]
aa[,`:=` (elu_t2=max((tour==2)*elu_ce_tour)),by=paste(id_personne_long,election_an,date_debut_ministere)]
table(aa$elu_t2,aa$elu_t1,useNA="ifany")
aa$elu_t2 <- ifelse(aa$nb_tour ==1  ,NA,aa$elu_t2)
table(aa$elu_t2,aa$tour)
aa$elu_in_fine <- ifelse(aa$tour==9,NA,( aa$elu_t1 %in% 1 | aa$elu_t2 %in% 1 | aa$elu_ce_tour %in% 1) *1)

table(aa$tour,aa$elu_in_fine,useNA = "ifany")
table(aa$tour,aa$elu_t1,useNA = "ifany")
table(aa$tour,aa$elu_t2,useNA = "ifany")
table(aa$election_an_tour,aa$elu_in_fine,useNA = "ifany")


aa$id_personne <- 10^7 + as.numeric(as.factor(aa$id_personne_long))



qq4_fid <- length(table(aa$id_personne))
print(qq4_fid)
qq4_fid <- data.frame(qq4_fid)
colnames(qq4_fid) <- c("n")
qq4_fid$id <- "id_personne"
qq4 <- rbind(qq4,qq4_fid)

gc()

# Selection of variables
bb <- aa [,list(id_election_candidat,
                    id_election_candidat_long,
                    id_personne,
                    id_personne_long,
                    id_fichier,
                    election,
                    scrutin,
                    date,
                    annee,
                    nb_tour,
                    tour,
                    election_an,
                    election_an_tour,
                    code_circo_europ,
                    code_region_1982,
                    code_region_2015,
                    code_departement,
                    code_circo_leg,
                    code_canton,
                    code_commune,
                    code_commune_section,
                    code_unite,
                    libelle_unite,
                    suppleant,
                    id_nom_liste,
                    nom,
                    prenom,
                    nom_std,
                    prenom_std,
                    nom_prenom_std,
                    nom_prenom_std_cor,
                    date_naissance,
                    date_naissance_cor,
                    sexe,
                    profession,
                    code_csp,
                    code_csp_std,
                    sortant,
                    nuance,
                    inscrits,
                    votants,
                    abstentions,
                    exprimes,
                    voix,
                    intitule_liste,
                    sexe_tete_liste,
                    nom_tete_liste,
                    prenom_tete_liste,
                    rang_liste,
                    nb_liste,
                    max_rang_liste,
                    sieges_liste,
                    elu_ce_tour,
                    elu_t1,
                    elu_t2,
                    elu_in_fine,
                    id_election_candidat_t1,
                    id_election_candidat_t2,
                    date_debut_ministere)]

bb <- bb[order(bb$id_personne,bb$date),]
rownames(bb) <- NULL

## Final Export ####
str(bb)
bb <- as.data.frame(bb)
str(bb)
saveRDS(bb,"base_candidatEs_id_v2.rds")
# write.csv(bb,"base_candidatEs_id_v2.csv",na='', row.names=FALSE,fileEncoding = "UTF-8")

## Quality statistics ####
write.csv(qq1,"qq1.csv")
write.csv(qq2,"qq2.csv")
write.csv(qq3,"qq3.csv")
write.csv(qq4,"qq4.csv")
write.csv(qq5,"qq5.csv")
write.csv(qq6,"qq6.csv")


table(bb$election_an_tour[bb$tour==2], is.na(bb$id_election_candidat_t1)[bb$tour==2])
addmargins(table(bb$election[bb$tour==2], is.na(bb$id_election_candidat_t1[bb$tour==2])))


#-------------------------------#
# Cleaning ####
#-------------------------------#

rm(list=ls())
gc()
#-------------------------------#
# End ####
#-------------------------------#