diff --git a/DESCRIPTION b/DESCRIPTION index 676292e..3379f2d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,29 +1,28 @@ -Package: correspondenceTables -Type: Package -Title: Creating Correspondence Tables Between Two Statistical Classifications -Date: 2022-09-30 -Version: 0.9.9 -Authors@R: c(person("Vasilis", "Chasiotis", role = "aut", comment="Department of Statistics, Athens University of Economics and Business"), - person("Photis", "Stavropoulos", role = "aut", comment="Quantos S.A. Statistics and Information Systems"), - person("Martin", "Karlberg", role = "aut"), - person("Mátyás", "Mészáros", email = "matyas.meszaros@ec.europa.eu", role = "cre"), - person("Martina", "Patone", role = "aut"), - person("Erkand", "Muraku", role = "aut"), - person("Clement", "Thomas", role = "aut"), - person("Loic", "Bienvenu", role = "aut")) -Description: - A candidate correspondence table between two classifications can be created when there are correspondence tables leading from the first classification to the second one via intermediate 'pivot' classifications. - The correspondence table between two statistical classifications can be updated when one of the classifications gets updated to a new version. -License: EUPL -Encoding: UTF-8 -Imports: data.table, httr, tidyverse, writexl, stringr, igraph, tools -Suggests: - knitr, - rmarkdown, - tinytest -VignetteBuilder: knitr -NeedsCompilation: no -URL: https://github.com/eurostat/correspondenceTables -BugReports: https://github.com/eurostat/correspondenceTables/issues -Maintainer: Mátyás Mészáros -RoxygenNote: 7.2.3 +Package: correspondenceTables +Type: Package +Title: Creating Correspondence Tables Between Two Statistical Classifications +Date: 2022-09-30 +Version: 0.10.10 +Authors@R: c(person("Vasilis", "Chasiotis", role = "aut", comment="Department of Statistics, Athens University of Economics and Business"), + person("Photis", "Stavropoulos", role = "aut", comment="Quantos S.A. 
Statistics and Information Systems"), + person("Martin", "Karlberg", role = "aut"), + person("Mátyás", "Mészáros", email = "matyas.meszaros@ec.europa.eu", role = "cre"), + person("Martina", "Patone", role = "aut"), + person("Erkand", "Muraku", role = "aut"), + person("Clement", "Thomas", role = "aut"), + person("Loic", "Bienvenu", role = "aut")) +Description: + A candidate correspondence table between two classifications can be created when there are correspondence tables leading from the first classification to the second one via intermediate 'pivot' classifications. + The correspondence table between two statistical classifications can be updated when one of the classifications gets updated to a new version. +License: EUPL +Encoding: UTF-8 +Imports: data.table, httr, stringr, igraph, tools +Suggests: + knitr, + rmarkdown, tinytest +VignetteBuilder: knitr +NeedsCompilation: no +URL: https://github.com/eurostat/correspondenceTables +BugReports: https://github.com/eurostat/correspondenceTables/issues +Maintainer: Mátyás Mészáros +RoxygenNote: 7.2.3 diff --git a/R/Check_n_columns.R b/R/Check_n_columns.R new file mode 100644 index 0000000..2c8ba8e --- /dev/null +++ b/R/Check_n_columns.R @@ -0,0 +1,16 @@ +check_n_columns <- function(df, source, num_columns) { + caller <- sys.call(-1) #define the caller function + tryCatch({ + if (ncol(df) < num_columns) { + stop(paste("In",as.character(caller[1]), ", the data", source, " has less than the minimum required ", num_columns, "columns.")) + } else if (ncol(df) > num_columns) { + warning(paste("In", as.character(caller[1]),", the data", source, " has more than", num_columns, "columns.")) + } else { + #print(paste("The data", source, " has exactly", num_columns, "columns.")) + } + }, error = function(e) { + print(paste("Error:", e$message)) + }, warning = function(e) { + print(paste("Warning:", e$message)) + }) +} \ No newline at end of file diff --git a/R/CsvFileSave.R b/R/CsvFileSave.R new file mode 100644 index 
0000000..a00f0c1 --- /dev/null +++ b/R/CsvFileSave.R @@ -0,0 +1,31 @@ +CsvFileSave <- function(CSVpath, OutputDF) { + if (!is.null(CSVpath)) { + + # Check if the file exists and prompt for overwrite confirmation + if (file.exists(CSVpath)) { + cat("A CSV file with the same name already exists.\n") + cat("Warning: This action will overwrite the existing file.\n") + + proceed <- "" + while (!(proceed %in% c("y", "n"))) { + proceed <- tolower(readline("Do you want to proceed? (y/n): ")) + if (!(proceed %in% c("y", "n"))) { + cat("Invalid input. Please enter 'y' or 'n'.\n") + } + } + + if (proceed != "y") { + cat("Operation aborted.\n") + return(NULL) + } + } + # Try to write the CSV file with error handling + tryCatch({ + write.csv(OutputDF, CSVpath, row.names = FALSE) + cat("The table was saved in", CSVpath, "\n") + }, error = function(e) { + cat("An error occurred while writing to the file:\n") + cat(e$message, "\n") + }) + } +} diff --git a/R/aggregateCorrespondenceTable.R b/R/aggregateCorrespondenceTable.R index 8f34703..8292653 100644 --- a/R/aggregateCorrespondenceTable.R +++ b/R/aggregateCorrespondenceTable.R @@ -16,65 +16,122 @@ #' B <- (system.file("extdata", "b_data.csv", package = "correspondenceTables")) #' #' -#' result <- aggregateCorrespondenceTable(AB = AB, A = A, B = B, CSVout = FALSE) +#' result <- aggregateCorrespondenceTable(AB = AB, A = A, B = B, CSVout = NULL) #' print(result) #' aggregateCorrespondenceTable <- function(AB, A, B, CSVout = NULL ) { - # Check if input files exist - if (!file.exists(AB) || !file.exists(A) || !file.exists(B)) { - stop("One or more input files do not exist.") - } - # Check if the input files are csv - if (!grepl("\\.csv$", AB) || !grepl("\\.csv$", A) || !grepl("\\.csv$", B)) { - stop("One or more input files do not have a .csv extension.") - } - # Read the input correspondence table AB - ab_data <- read.csv2(AB, header = TRUE, sep =",") - ColumnNames <- colnames(ab_data)[1:2] - colnames(ab_data)[1:2] = 
c("Acode","Bcode") - # Check if AB table has the required columns - if (!all(c("Acode", "Bcode") %in% colnames(ab_data))) { - stop("Input correspondence table AB must have columns 'Acode' and 'Bcode'.") - } + ab_data <- testInputTable("Correspondence table (AB)", AB) + a_data <- testInputTable("Source classification (A)", A) + b_data <- testInputTable("Target classification (B)", B) - # Check for duplicate combinations of Acode and Bcode in AB - if (any(duplicated(ab_data[c("Acode", "Bcode")]))) { - stop("Duplicate combinations of Acode and Bcode found in the input correspondence table AB.") - } +#Check if required number of columns are present in each input +check_n_columns(ab_data,"Correspondence table (AB)", 2) +check_n_columns(a_data, "Source classification (A)", 3) +check_n_columns(b_data,"Target classification (B)", 3) - # Filter out records with missing Acode or Bcode - ab_data <- ab_data[!is.na(ab_data$Acode) & !is.na(ab_data$Bcode), ] - - # Check if there are any records left - if (nrow(ab_data) == 0) { - stop("No valid records found in the input correspondence table AB.") - } + # Read the input correspondence table AB + # ab_data <- read.csv2(AB, header = TRUE, sep =",") + ColumnNames_ab <- colnames(ab_data)[1:2] + colnames(ab_data)[1:2] = c("Acode","Bcode") + + # Check if there are any records + tryCatch( + { + if (nrow(ab_data) == 0) { + stop("No valid records found in the input correspondence table AB.") + } + }, error = function(e) { + message("Error in aggregateCorrespondenceTable: ",conditionMessage(e)) + stop(e) + }) + + # Find duplicated combinations of Acode and Bcode in AB + duplicated_rows <- ab_data[duplicated(ab_data[c("Acode", "Bcode")]), c("Acode", "Bcode")] + tryCatch( + { + # Check for duplicate combinations of Acode and Bcode + if (nrow(duplicated_rows) > 0) { + stop("Please remove duplicate(s) combinations of Acode and Bcode from the input correspondence table AB.") + } + }, error = function(e) { + message("Error in 
aggregateCorrespondenceTable:",conditionMessage(e)) + print(duplicated_rows) + stop(e) + }) + + + # Filter rows where Acode or Bcode is missing in the AB data + missing_code_rows <- ab_data[is.na(ab_data$Acode) | ab_data$Acode == "" | is.na(ab_data$Bcode) | ab_data$Bcode == "", ] + tryCatch( + { + # Display problematic rows + if (nrow(missing_code_rows) > 0) { + stop(paste("Rows with missing values in the", ColumnNames_ab[1], "or", ColumnNames_ab[2], "column of the AB data:")) + } + }, error = function(e) { + message("Error in aggregateCorrespondenceTable: ",conditionMessage(e)) + print(missing_code_rows) + stop(e) + }) + ######## ####Read the source classification table A - a_data <- read.csv(A, header = TRUE, sep = ",") + + ColumnNames_a <- colnames(a_data)[1:3] colnames(a_data)[1:3] = c("Acode","Alevel","Asuperior") - # Check if A table has the required columns - if (!all(c("Acode", "Alevel", "Asuperior") %in% colnames(a_data))) { - stop("Source classification table A must have columns 'Acode', 'Alevel', and 'Asuperior'.") - } - #Uniqueness 3.2 Check for duplicate Acode values in A - if (any(duplicated(a_data$Acode))) { - stop("Duplicate Acode values found in the source classification table A.") + # Check if there are records in table A + tryCatch({ + if (nrow(a_data) == 0) { + stop("No valid records found in the input correspondence table A.") + } + }, error = function(e) { + message("Error in aggregateCorrespondenceTable while processing input correspondence table A\n",conditionMessage(e)) + stop(e) + }) + + # Filter rows where there are NA or empty values in the Alevel column + problematic_rows <- a_data[is.na(a_data$Alevel) | a_data$Alevel == "", ] + + # Display problematic rows + if (nrow(problematic_rows) > 0) { + print(paste("Rows with missing or empty values in the Alevel column:", ColumnNames_a[2])) + print(problematic_rows) + cat("\n") } - # Check if Alevel is numeric - if (!all(is.numeric(a_data$Alevel))) { - stop("Alevel column in the source 
classification table A must contain numeric values.") + # Check for duplicate Acode values in table A + Aduplicated_rows <- a_data[duplicated(a_data$Acode), "Acode"] + if (length(Aduplicated_rows) > 0) { + message(paste("Duplicate(s) value(s) of Acode column named:", ColumnNames_a[1], "found in the input table A:")) + print(Aduplicated_rows) + stop(paste("Please remove duplicate(s) values of Acode column named:", ColumnNames_a[1], "in the input table A.")) + } else { + # print("No duplicate(s) value(s) of Acode in the input table A.") } - # Check if Asuperior is a character or blank - if (!all(is.character(a_data$Asuperior) | a_data$Alevel == 1)) { - stop("Asuperior column in the source classification table A must contain characters or be blank for records at level 1.") + + # Identify rows with text in Asuperior for level 1 records + a_level_1_with_text <- a_data[a_data$Alevel == 1 & !is.na(a_data$Asuperior) & a_data$Asuperior != "", ] + + # Display rows with text in Asuperior for level 1 records + if (nrow(a_level_1_with_text) > 0) { + message(paste("In the source classification table, the following records at level 1 have text in the Asuperior column:", ColumnNames_a[3])) + print(a_level_1_with_text) + stop() } + # Check if Asuperior is a character or blank for records at level 1 + tryCatch({ + if (!all((is.character(a_data$Asuperior) & a_data$Alevel != 1) | (is.na(a_data$Asuperior) & a_data$Alevel == 1))) { + stop(paste("Asuperior column,", ColumnNames_a[3], "in the source classification table A must be blank for records at level 1.")) + } + }, error = function(e) { + stop(e) + }) + # Initialize the variable to store the current level for A mostGranularA <- max(a_data$Alevel) currentLevelA <- mostGranularA @@ -86,84 +143,142 @@ aggregateCorrespondenceTable <- function(AB, A, B, CSVout = NULL ) { AiMinus1 <- a_data[a_data$Alevel == (currentLevelA - 1), ] # Check if all values of Asuperior (in Ai) correspond to values of Acode (in AiMinus1) - error_rows <- 
which(!(Ai$Asuperior %in% AiMinus1$Acode)) - if (length(error_rows) > 0) { - cat("Hierarchy error in A-data at level:", currentLevelA, "\n") - cat("Error occurred in rows:", error_rows, "\n") - break # Exit the loop if an error is detected - } + error_occurence <- which(!(Ai$Asuperior %in% AiMinus1$Acode)) + error_rows <- Ai[error_occurence,] + if (length(error_occurence) > 0) { + cat("Hierarchy error in A_data at level:", currentLevelA, "\n") + cat("For the specified level, error at occurence:", error_occurence, "\n") + cat("Offending row:\n") + print(error_rows) + stop("Hierarchy error detected in A_data.") + } # Check if all values of Acode (in AiMinus1) correspond to values of Asuperior (in Ai) - error_rows <- which(!(AiMinus1$Acode %in% Ai$Asuperior)) - if (length(error_rows) > 0) { + error_occurence <- which(!(AiMinus1$Acode %in% Ai$Asuperior)) + error_rows <- AiMinus1[error_occurence,] + if (length(error_occurence) > 0) { cat("Hierarchy error in A-data at level:", currentLevelA - 1, "\n") - cat("Error occurred in rows:", error_rows, "\n") - break # Exit the loop if an error is detected + cat("For the specified level, error at occurence:", error_occurence, "\n") + cat("Offending row:\n" ) + print(error_rows) + stop("Hierarchy error detected in A_data.") } # Move to the next level currentLevelA <- currentLevelA - 1 } + # Read the target classification table B - b_data <- read.csv(B, header = TRUE, sep = ",") + ColumnNames_b <- colnames(b_data)[1:3] colnames(b_data)[1:3] = c("Bcode","Blevel","Bsuperior") - # Check if B table has the required columns - if (!all(c("Bcode", "Blevel", "Bsuperior") %in% colnames(b_data))) { - stop("Target classification table B must have columns 'Bcode', 'Blevel', and 'Bsuperior'.") + + + # Check if there are any records left in table B + if (nrow(b_data) == 0) { + stop("No valid records found in the input correspondence table B.") } - #Uniqueness 3.2 Check for duplicate Bcode values in B - if (any(duplicated(b_data$Bcode))) { - 
stop("Duplicate Bcode values found in the target classification table B.") + # Filter rows where there are NA or empty values in the Blevel column + problematic_rows <- b_data[is.na(b_data$Blevel) | b_data$Blevel == "", ] + + # Display problematic rows + if (nrow(problematic_rows) > 0) { + print(paste("Rows with missing or empty values in the Blevel column:", ColumnNames_b[2])) + print(problematic_rows) + cat("\n") } - # Check if Blevel is numeric in B - if (!all(is.numeric(b_data$Blevel))) { - stop("Blevel column in the target classification table B must contain numeric values.") + # Check for duplicate Bcode values in table B + Bduplicated_rows <- b_data[duplicated(b_data$Bcode), "Bcode"] + if (length(Bduplicated_rows) > 0) { + message(paste("Duplicate(s) value(s) of Bcode column named:", ColumnNames_b[1], "found in the input table B :")) + print(Bduplicated_rows) + stop(paste("Please remove duplicate(s) value(s) of Bcode column named:", ColumnNames_b[1],"in the input table B .")) + } else { + # print("No duplicate(s) value(s) of Bcode in the input table B .") } - # Check if Bsuperior is a character or blank in B - if (!all(is.character(b_data$Bsuperior) | b_data$Blevel == 1)) { - stop("Bsuperior column in the target classification table B must contain characters or be blank for records at level 1.") + + # Identify rows with text in Bsuperior for level 1 records + b_level_1_with_text <- b_data[b_data$Blevel == 1 & !is.na(b_data$Bsuperior) & b_data$Bsuperior != "", ] + + # Display rows with text in Bsuperior for level 1 records + if (nrow(b_level_1_with_text) > 0) { + message(paste("Bsuperior column,", ColumnNames_b[3], "in the target classification table B must be blank for records at level 1.")) + print(b_level_1_with_text) + stop() } + # Check if Bsuperior is a character or blank for records at level 1 + tryCatch({ + if (!all((is.character(b_data$Bsuperior) & b_data$Blevel != 1) | (is.na(b_data$Bsuperior) & b_data$Blevel == 1))) { + stop(paste("Bsuperior 
column,", ColumnNames_b[3], "in the source classification table B must contain characters or be blank for records at level 1.")) + } + }, error = function(e) { + stop(e) + }) + # Initialize the variable to store the current level mostGranularB <- max(b_data$Blevel) currentLevelB <- mostGranularB - # Loop to check hierarchy at each level for B while (currentLevelB >= 2) { # Select rows at the current level and the level below Bi <- b_data[b_data$Blevel == currentLevelB, ] BiMinus1 <- b_data[b_data$Blevel == (currentLevelB - 1), ] # Check if all values of Bsuperior (in Bi) correspond to values of Bcode (in BiMinus1) - error_rows <- which(!(Bi$Bsuperior %in% BiMinus1$Bcode)) - if (length(error_rows) > 0) { - cat("Hierarchy error in B-data at level:", currentLevelB, "\n") - cat("Error occurred in rows:", error_rows, "\n") - break # Exit the loop if an error is detected - } + error_occurence <- which(!(Bi$Bsuperior %in% BiMinus1$Bcode)) + error_rows <- Bi[error_occurence,] + if (length(error_occurence) > 0) { + cat("Hierarchy error in B_data at level:", currentLevelB, "\n") + cat("For the specified level, error at occurence:", error_occurence, "\n") + cat("Offending row:\n" ) + print(error_rows) + stop("Hierarchy error detected in B_data.") + } # Check if all values of Bcode (in BiMinus1) correspond to values of Bsuperior (in Bi) - error_rows <- which(!(BiMinus1$Bcode %in% Bi$Bsuperior)) - if (length(error_rows) > 0) { + error_occurence <- which(!(BiMinus1$Bcode %in% Bi$Bsuperior)) + error_rows <- BiMinus1[error_occurence,] + if (length(error_occurence) > 0) { cat("Hierarchy error in B_data at level:", currentLevelB - 1, "\n") - cat("Error occurred in rows:", error_rows, "\n") - break # Exit the loop if an error is detected + cat("For the specified level, error at occurence:", error_occurence, "\n") + cat("Offending row:\n" ) + print(error_rows) + stop("Hierarchy error detected in B_data.") } # Move to the next level currentLevelB <- currentLevelB - 1 } - #Uniqueness 
Check if Acode and Bcode in AB exist in A and B respectively - if (!all(ab_data$Acode %in% a_data$Acode) || !all(ab_data$Bcode %in% b_data$Bcode)) { - stop("Acode or Bcode in the input correspondence table does not exist in source or target classification table.") + # Uniqueness Check if Acode and Bcode in AB exist in A and B respectively + if (!all(ab_data$Acode %in% a_data$Acode)) { + offending_Acodes <- ab_data$Acode[!ab_data$Acode %in% a_data$Acode] + tryCatch( + stop(paste("Acode in the input correspondence table does not exist in source classification table. Offending Acodes:", paste(offending_Acodes, collapse = ", "))), + error = function(e) { + cat("Error:", e$message, "\n") + stop(e) + } + ) + } else if (!all(ab_data$Bcode %in% b_data$Bcode)) { + offending_Bcodes <- ab_data$Bcode[!ab_data$Bcode %in% b_data$Bcode] + tryCatch( + stop(paste("Bcode in the input correspondence table does not exist in target classification table. Offending Bcodes:", paste(offending_Bcodes, collapse = ", "))), + error = function(e) { + cat("Error:", e$message, "\n") + stop(e) + } + ) } - ###3.4 Correct and complete correspondences + + + + ###3.4 Correct and complete correspondences ###add additional the column because here you just add the code you need all the column AmostGranular <- subset(a_data, Alevel == max(Alevel), select = c(Acode, Asuperior)) @@ -171,12 +286,13 @@ aggregateCorrespondenceTable <- function(AB, A, B, CSVout = NULL ) { AB_mostGranular <- merge(AmostGranular, BmostGranular, by.x = "Acode", by.y = "Bcode") - if (!(all(ab_data$Acode %in% AmostGranular$Acode) && all(ab_data$Bcode %in% BmostGranular$Bcode))) { - stop("Acode or Bcode in the input correspondence table does not exist in source or target classification table.") + if (!(all(AB_mostGranular$Acode %in% ab_data$Acode))) { + offending_Acodes <- AB_mostGranular$Acode[!AB_mostGranular$Acode %in% ab_data$Acode] + stop(paste("Acode in the most granular correspondence table does not exist in the input 
correspondence table. Offending Acodes:", paste(offending_Acodes, collapse = ", "))) } - - if (!(all(AB_mostGranular$Acode %in% ab_data$Acode) && all(AB_mostGranular$Bcode %in% ab_data$Bcode))) { - stop("Acode or Bcode in the most granular correspondence table does not exist in the input correspondence table.") + if (!(all(AB_mostGranular$Bcode %in% ab_data$Bcode))) { + offending_Bcodes <- AB_mostGranular$Bcode[!AB_mostGranular$Bcode %in% ab_data$Bcode] + stop(paste("Bcode in the most granular correspondence table does not exist in the input correspondence table. Offending Bcodes:", paste(offending_Bcodes, collapse = ", "))) } ########## 4.1 Creation of the table and merge it. @@ -188,7 +304,7 @@ aggregateCorrespondenceTable <- function(AB, A, B, CSVout = NULL ) { level_data <- subset(a_data, a_data$Alevel == i, select = c(Acode, Asuperior)) A_levels[[i]] <- level_data } - + # Create an empty data frame to store the final result for A resultA <- data.frame() @@ -208,7 +324,7 @@ aggregateCorrespondenceTable <- function(AB, A, B, CSVout = NULL ) { } # Result will contain the final aggregated correspondence table with hierarchical code columns for A - resultA$test <- resultA[[paste0("Acode", mostGranularA)]] + resultA$granular <- resultA[[paste0("Acode", mostGranularA)]] resultA$Asuperior <- NULL # Determine the most granular level dynamically @@ -242,14 +358,17 @@ aggregateCorrespondenceTable <- function(AB, A, B, CSVout = NULL ) { } # Result will contain the final aggregated correspondence table with hierarchical code columns for B - resultB$test <- resultB[[paste0("Bcode", mostGranularB)]] + resultB$granular <- resultB[[paste0("Bcode", mostGranularB)]] resultB$Bsuperior <- NULL - # Merge resultA and resultB using the 'test' column as the key - Merged_AB <- merge(resultA, resultB, by.x = "test", by.y = "test", all = F) - Merged_AB$test <- NULL - + # Merge resultA and resultB using the 'granular' column as the key + Merged_resultA_AB <- merge(resultA, ab_data, by.x = 
"granular", by.y= "Acode", all= F) + Merged_AB <- merge(Merged_resultA_AB, resultB, by.x= "Bcode", by.y= "granular", all=F) + names(Merged_AB)[names(Merged_AB) == "Acode"] <- "Acode1" + names(Merged_AB)[names(Merged_AB) == "Bcode.y"] <- "Bcode1" + Merged_AB$Bcode <- NULL + Merged_AB$granular <- NULL ##Table merged final_result <-Merged_AB @@ -289,7 +408,9 @@ aggregateCorrespondenceTable <- function(AB, A, B, CSVout = NULL ) { } # Convert Matrices to Dataframe - results_df <- as.data.frame(do.call(rbind, results_matrices)) + + results_df <- suppressWarnings(as.data.frame(do.call(rbind, results_matrices))) + # Extract levels from Acode and Bcode columns acode_levels <- gsub("^Acode(\\d+)$", "\\1", acode_columns) @@ -298,28 +419,22 @@ aggregateCorrespondenceTable <- function(AB, A, B, CSVout = NULL ) { acode_levels <- rep(acode_levels, length.out = max_levels) bcode_levels <- rep(bcode_levels, length.out = max_levels) # Define the new column names - new_colnames <- c(paste0(ColumnNames[1]," level"), - paste0(ColumnNames[2]," level"), - ColumnNames[1], - ColumnNames[2], - paste0("N of ", ColumnNames[1], acode_levels, " level values "), - paste0("N of ", ColumnNames[2] , bcode_levels , " level values ")) + new_colnames <- c(paste0(ColumnNames_ab[1]," level"), + paste0(ColumnNames_ab[2]," level"), + ColumnNames_ab[1], + ColumnNames_ab[2], + paste("N of", ColumnNames_ab[1], "level", acode_levels, "values"), + paste("N of", ColumnNames_ab[2] , "level", bcode_levels , "values")) # Update column names in results_df colnames(results_df) <- new_colnames - # Display Results - if (!is.null(CSVout)) { - if (is.logical(CSVout) && CSVout == TRUE) { - file_name <- paste0("AgrgregateCorrespondeceTable_", ColumnNames[1], " & ",ColumnNames[2], ".csv") - path_file <- file.path(getwd(), file_name) - write.csv(results_df, path_file, row.names = FALSE) - message(paste0("The table was saved in ", getwd(), file_name)) - } else if (is.character(CSVout)) { - write.csv(results_df, CSVout, 
row.names = FALSE) - } - } - return(results_df) + # Display Results + # Using the testCsvParameter function to validate CSVout + testCsvParameter("CSV", CSVout) + + CsvFileSave(CSVout, results_df) + +return(results_df) } - diff --git a/R/analyseCorrespondancetable.R b/R/analyseCorrespondancetable.R index e0ec721..5c96b6b 100644 --- a/R/analyseCorrespondancetable.R +++ b/R/analyseCorrespondancetable.R @@ -1,335 +1,432 @@ -#' @title analyseCorrespondenceTable performs analysis on correspondence tables -#' @description The `analyseCorrespondenceTable` function takes input correspondence tables (AB) and related data (A and B) to perform analysis and generate various statistics. -#' It checks the validity of the input data, identifies components, calculates correspondence types, and creates summary tables. -#' @param AB a mandatory argument containing a CSV file provide by the user contains the correspondence table data with columns "Acode" and "Bcode". -#' @param A a path to a CSV file containing source classification data with "Acode" column. -#' @param formatA A regular expression pattern to filter source classification data based on "Acode" should contains start & end position. -#' @param B a path to a CSV file containing target classification data with "Bcode" column. -#' @param formatB A regular expression pattern to filter target classification data based on "Bcode" should contains start & end position. -#' @param CSVcorrespondenceInventory The valid values are not NULL if the user put a path with a empty csv file it will return it with the correspondeceInventory or just a path with a csv file . By default no CSV is produce -#' @param CSVcorrespondenceAnalysis Provide an output containing the correpondenceAnalysis. the user put a path a empty file it will return with correpondenceAnalysis. 
by default no CSV is produce -#' @importFrom igraph graph.data.frame decompose.graph -#' @import igraph -#' -#' @return A list containing two data frames: Annexe_A and Annexe_B. -#' The `CSVcorrespondenceInventory` contains statistics related to components, correspondence types, and source/target positions. -#' The `CSVcorrespondenceAnalysis` contains statistics for each class in the correspondence table. -#' -#' @export -#' @examples -#' # Use data from the folder extdata -#' -#' -#' -#' -#' -#' -#' -#' -#' # Perform analysis -#' result <- analyseCorrespondenceTable(AB =system.file("extdata", "ExempleAnnexe.csv", package = "correspondenceTables"),A = NULL, formatA = NULL, B = NULL, formatB = NULL, CSVcorrespondenceInventory = NULL, CSVcorrespondenceAnalysis = NULL) -#' print(result$Annexe_A) -#' print(result$Annexe_B) - - - -analyseCorrespondenceTable <- function(AB, A = NULL, formatA = NULL, B = NULL, formatB = NULL, - CSVcorrespondenceInventory = NULL, CSVcorrespondenceAnalysis = NULL) { - - if (class(AB) == "character") { - # If AB is a character string, assume it's a path to a CSV file - input_file_path <- AB - - AB <- read.csv2(input_file_path, header = TRUE, sep = ",") - } else if (class(AB) == "data.frame") {} else { - stop("Parameter AB must be a path to a CSV file") - } - ColumnNames <- colnames(AB)[1:2] - colnames(AB)[1:2] = c("Acode", "Bcode") - # Check if AB file has required columns - if (!("Acode" %in% colnames(AB)) || !("Bcode" %in% colnames(AB))) { - stop("The AB file must contain 'Acode' and 'Bcode' columns.") - } - - # Filter out records with missing Acode or Bcode - AB <- AB[complete.cases(AB[c("Acode", "Bcode")]), ] - - # Check if there are any valid records after filtering - if (nrow(AB) == 0) { - stop("No valid records found in the AB file.") - } - - # Check uniqueness of code pairs in AB file - duplicate_pairs <- duplicated(AB[c("Acode", "Bcode")]) | duplicated(AB[c("Acode", "Bcode")], fromLast = TRUE) - if (any(duplicate_pairs)) { - 
first_duplicate <- AB[duplicate_pairs, c("Acode", "Bcode")][1, ] - stop(paste("Duplicate code pair found in AB file:", first_duplicate$Acode, "-", first_duplicate$Bcode)) - } - - # Read A file if provided - if (!is.null(A)) { - a_data <- read.csv(A, header = TRUE) - colnames(a_data)[1:1] = c("Acode") - - # Check if A file has the required column - if (!("Acode" %in% colnames(a_data))) { - stop("The A file must contain 'Acode' column.") - } - - # Filter out A records based on formatA, if specified - if (!is.null(formatA)) {# Check if formatA is numeric vector with two elements - if (!is.numeric(formatA) || length(formatA) != 2) { - stop("formatA must be a numeric vector with two elements.") - } - - # Convert Acode to character and filter based on end position - a_data$Acode <- as.character(a_data$Acode) - a_data <- a_data[nchar(a_data$Acode) == formatA[2], ] - - # Check if there are any valid records after end position filtering - if (nrow(a_data) == 0) { - stop("No valid records found in the A file after applying end position filter.") - } - } - - # Check uniqueness of A codes - duplicate_a_codes <- duplicated(a_data$Acode) | duplicated(a_data$Acode, fromLast = TRUE) - if (any(duplicate_a_codes)) { - first_duplicate <- a_data[duplicate_a_codes, "Acode"][1] - stop(paste("Duplicate Acode found in A file:", first_duplicate)) - } - - # Find unmatched source classification codes - unmatched_codes_A <- setdiff(a_data$Acode, AB$Acode) - noCorrespondenceA <- a_data[a_data$Acode %in% unmatched_codes_A, ] - noClassificationA <- AB[AB$Acode %in% unmatched_codes_A, ] - - # Print the length of noCorrespondenceA or a message indicating all codes in A are covered - if (nrow(noCorrespondenceA) > 0) { - cat("Number of unmatched source classification codes in A:", nrow(noCorrespondenceA), "\n") - } else { - cat("All source classification codes in A are covered in the correspondence table.\n") - } - - # Print the length of noClassificationA or a message indicating all codes in the 
correspondence table are covered - if (nrow(noClassificationA) > 0) { - cat("Number of source classification codes in AB not found in A:", nrow(noClassificationA), "\n") - } else { - cat("All source classification codes in the correspondence table are covered by A.\n") - } - } - - # Read B file if provided - if (!is.null(B)) { - b_data <- read.csv(B, header = TRUE) - colnames(b_data)[1:1] = c("Bcode") - - # Check if B file has the required column - if (!("Bcode" %in% colnames(b_data))) { - stop("The B file must contain 'Bcode' column.") - } - - # Filter out B records based on formatB, if specified - if (!is.null(formatB)) { - # Check if formatB is numeric vector with two elements - if (!is.numeric(formatB) || length(formatB) != 2) { - stop("formatB must be a numeric vector with two elements.") - } - - # Convert Bcode to character and filter based on end position - b_data$Bcode <- as.character(b_data$Bcode) - b_data <- b_data[nchar(b_data$Bcode) == formatB[2], ] - - # Check if there are any valid records after end position filtering - if (nrow(b_data) == 0) { - stop("No valid records found in the B file after applying end position filter.") - } - } - - # Check uniqueness of B codes - duplicate_b_codes <- duplicated(b_data$Bcode) | duplicated(b_data$Bcode, fromLast = TRUE) - if (any(duplicate_b_codes)) { - first_duplicate <- b_data[duplicate_b_codes, "Bcode"][1] - stop(paste("Duplicate Bcode found in B file:", first_duplicate)) - } - - # Find unmatched source classification codes for B - unmatched_codes_B <- setdiff(b_data$Bcode, AB$Bcode) - noCorrespondenceB <- b_data[b_data$Bcode %in% unmatched_codes_B, ] - noClassificationB <- AB[AB$Bcode %in% unmatched_codes_B, ] - - # Print the length of noCorrespondenceB or a message indicating all codes in B are covered - if (nrow(noCorrespondenceB) > 0) { - cat("Number of unmatched source classification codes in B:", nrow(noCorrespondenceB), "\n") - } else { - cat("All source classification codes in B are covered in the 
correspondence table.\n") - } - - # Print the length of noClassificationB or a message indicating all codes in the correspondence table are covered - if (nrow(noClassificationB) > 0) { - cat("Number of source classification codes in AB not found in B:", nrow(noClassificationB), "\n") - } else { - cat("All source classification codes in the correspondence table are covered by B.\n") - } - } - - # Filter AB data based on formatA and formatB, if specified - if (!is.null(formatA) && !is.null(formatB)) { - AB$Acode <- as.character(AB$Acode) - AB$Bcode <- as.character(AB$Bcode) - AB <- AB[nchar(AB$Acode) == formatA & nchar(AB$Bcode) == formatB, ] - - # Check if there are any valid records after filtering - if (nrow(AB) == 0) { - stop("No valid records found in the AB file after applying formatA and formatB filters.") - } - } - #bipartitePart - # create the bipartite graph - g <- graph.data.frame(AB, directed = FALSE) - - # all composant - components <- decompose.graph(g) - - # list of composant by code - component_codes <- lapply(components, function(comp) V(comp)$name) - - AB$component <- NA - - for (i in seq_along(component_codes)) { - component <- component_codes[[i]] - AB$component[AB$Acode %in% component] <- paste("Component", i) - } - ### Print AB to see the component column for the correspondenceTable between Source & Target. 
- - component_codes <- unlist(component_codes) - - # Creation of the table for each component - component_index <- 0 - component_stats <- lapply(components, function(comp) { - component <- V(comp)$name - n_unique_targets <- length(unique(AB[AB$Acode %in% component, "Bcode"])) - - correspondence_type <- if (n_unique_targets == 1) { - if (length(unique(AB[AB$Bcode %in% component, "Acode"])) == 1) { - "1:1" - } else { - "M:1" - } - } else { - if (length(unique(AB[AB$Bcode %in% component, "Acode"])) == 1) { - "1:M" - } else { - "M:M" - } - } - - source_positions <- unique(AB[AB$Acode %in% component, "Acode"]) - target_positions <- unique(AB[AB$Acode %in% component, "Bcode"]) - n_source_positions <- length(source_positions) - n_target_positions <- length(target_positions) - - component_index <- component_index + 1 - component_name <- unique(AB[AB$Acode %in% component, "component"]) - - list( - Component = component_name, - CorrespondenceType = correspondence_type, - SourcePositions = source_positions, - TargetPositions = target_positions, - nSourcePositions = n_source_positions, - nTargetPositions = n_target_positions - ) - }) - - # Creation of the new dataFrame - result <- do.call(rbind, component_stats) - - # Conversion into a data frame - Annexe_A <- as.data.frame(result) - - ## Creation of Annex B - Annexe_B <- data.frame( - ClassC = AB$Acode, - ClassD = AB$Bcode, - nTargetClasses = NA, - SourceToTargetMapping = NA, - nSourceClasses = NA, - TargetToSourceMapping = NA - ) - # Update nTargetClasses column - Annexe_B$nTargetClasses <- sapply(Annexe_B$ClassC, function(c_code) { - length(unique(Annexe_B[Annexe_B$ClassC == c_code, "ClassD"])) - }) - - # Update SourceToTargetMapping column - Annexe_B$SourceToTargetMapping <- sapply(Annexe_B$ClassC, function(c_code) { - paste(unique(Annexe_B[Annexe_B$ClassC == c_code, "ClassD"]), collapse = ", ") - }) - - # Update nSourceClasses column - Annexe_B$nSourceClasses <- sapply(Annexe_B$ClassD, function(d_code) { - 
length(unique(Annexe_B[Annexe_B$ClassD == d_code, "ClassC"])) - }) - - # Update TargetToSourceMapping column - Annexe_B$TargetToSourceMapping <- sapply(Annexe_B$ClassD, function(d_code) { - paste(unique(Annexe_B[Annexe_B$ClassD == d_code, "ClassC"]), collapse = ", ") - }) - - colnames(Annexe_B)[1:2] = ColumnNames[1:2] - # store annex on variable to make table - output_annex_A <- Annexe_A - output_annex_B <- Annexe_B - - annex_A_df <- as.data.frame(output_annex_A) - annex_A_df$Component <- as.character(annex_A_df$Component) - annex_A_df$CorrespondenceType <- as.character(annex_A_df$CorrespondenceType) - annex_A_df$SourcePositions <- as.character(annex_A_df$SourcePositions) - annex_A_df$TargetPositions <- as.character(annex_A_df$TargetPositions) - annex_A_df$nSourcePositions <- as.numeric(annex_A_df$nSourcePositions) - annex_A_df$nTargetPositions <- as.numeric(annex_A_df$nTargetPositions) - annex_B_df <- as.data.frame(output_annex_B) - - # Take the user's CSV file name to create CSV files - if (!is.null(input_file_path)) { - base_file_name <- tools::file_path_sans_ext(tools::file_path_sans_ext(basename(input_file_path))) - } else { - # Generate a unique base file name (e.g., based on the timestamp) - base_file_name <- paste0("correspondence_analysis_", format(Sys.time(), "%Y%m%d%H%M%S")) - } - - - if (!is.null(CSVcorrespondenceInventory)) { - if (is.character(CSVcorrespondenceInventory)) { - chemin_inventaire <- CSVcorrespondenceInventory - } else { - # Generate a file name based on the name of the first column, "correspondence", and the date - chemin_inventaire <- paste0("Correspondence_inventory_",ColumnNames[1], "_", ColumnNames[2], ".csv") - } - write.csv(annex_A_df, chemin_inventaire, row.names = FALSE) - message(paste0("The table was saved in ", getwd(), chemin_inventaire)) - } - - if (!is.null(CSVcorrespondenceAnalysis)) { - if (is.character(CSVcorrespondenceAnalysis)) { - # If it's a valid file path, use it - chemin_analyse <- CSVcorrespondenceAnalysis - } 
else { - # Generate a file name based on the name of the first column, "correspondence", and the date - chemin_analyse <- paste0("Correspondence_analysis_", ColumnNames[1], "_", ColumnNames[2], ".csv") - } - write.csv(annex_B_df, chemin_analyse, row.names = FALSE) - message(paste0("The table was saved in ", getwd(), chemin_analyse)) - } - - # Output list of the two dataframes. - output <- list(Annexe_A = output_annex_A, Annexe_B = output_annex_B) - - return(output) - -} - - - - +#' @title analyseCorrespondenceTable performs analysis on correspondence tables +#' @description The `analyseCorrespondenceTable` function takes input correspondence tables (AB) and related data (A and B) to perform analysis and generate various statistics. +#' It checks the validity of the input data, identifies components, calculates correspondence types, and creates summary tables. +#' @param AB a mandatory argument containing a CSV file provided by the user that contains the correspondence table data with columns "Acode" and "Bcode". +#' @param A a path to a CSV file containing source classification data with "Acode" column. +#' @param longestAcodeOnly A Boolean argument to filter source classification data based on "Acode", retaining only the codes of maximum length, thus the lowest level Acode. +#' @param B a path to a CSV file containing target classification data with "Bcode" column. +#' @param longestBcodeOnly A Boolean argument to filter target classification data based on "Bcode", retaining only the codes of maximum length, thus the lowest level Bcode. +#' @param CSVcorrespondenceInventory The valid values are NULL or a path to a csv file: if the user provides a path to an (empty) csv file, the correspondenceInventory is written to it. By default no CSV is produced. +#' @param CSVcorrespondenceAnalysis Provides an output containing the correspondenceAnalysis. If the user provides a path to an (empty) csv file, it is returned with the correspondenceAnalysis.
by default no CSV is produce +#' @importFrom igraph graph.data.frame decompose.graph +#' @import igraph +#' +#' @return A list containing two data frames: Inventory and Analysis. +#' The `CSVcorrespondenceInventory` contains statistics related to components, correspondence types, and source/target positions. +#' The `CSVcorrespondenceAnalysis` contains statistics for each class in the correspondence table. +#' +#' @export +#' @examples +#' # Use data from the folder extdata +#' +#' +#' +#' +#' +#' +#' +#' +#' # Perform analysis +#' result <- analyseCorrespondenceTable(AB =system.file("extdata", "ExempleAnnexe.csv", package = "correspondenceTables"),A = NULL, longestAcodeOnly = FALSE, B = NULL, longestBcodeOnly = FALSE, CSVcorrespondenceInventory = NULL, CSVcorrespondenceAnalysis = NULL) +#' print(result$Inventory) +#' print(result$Analysis) + + + +analyseCorrespondenceTable <- function(AB, A = NULL, longestAcodeOnly = FALSE, B = NULL, longestBcodeOnly = FALSE, + CSVcorrespondenceInventory = NULL, CSVcorrespondenceAnalysis = NULL) { + + # if (class(AB) == "character") { + # # If AB is a character string, assume it's a path to a CSV file + # input_file_path <- AB + # + # AB <- read.csv2(input_file_path, header = TRUE, sep = ",") + # } else if (class(AB) == "data.frame") {} else { + # stop("Parameter AB must be a path to a CSV file") + # } + + ab_data <- testInputTable("Correspondence table (AB)", AB) + #ab_data[] <- lapply(ab_data, as.character) + + #Check if required number of columns are present in input + #check_n_columns(ab_data,"Correspondence table (AB)", 2) + + ColumnNames_ab <- colnames(ab_data)[1:2] + colnames(ab_data)[1:2] = c("Acode", "Bcode") + unused_data_ab <- ab_data + + # # Check if AB file has required columns + # if (!("Acode" %in% colnames(AB)) || !("Bcode" %in% colnames(AB))) { + # stop("The AB file must contain 'Acode' and 'Bcode' columns.") + # } + # # Filter out records with missing Acode or Bcode + # AB <- AB[complete.cases(AB[c("Acode", 
"Bcode")]), ] + + # Check if there are any records + tryCatch( + { + if (nrow(ab_data) == 0) { + stop("No valid records found in the input correspondence table AB.") + } + }, error = function(e) { + message("Error in analyseCorrespondenceTable: ",conditionMessage(e)) + }) + + # Filter rows where Acode or Bcode is missing in the AB data + missing_code_rows <- ab_data[is.na(ab_data$Acode) | ab_data$Acode == "" | is.na(ab_data$Bcode) | ab_data$Bcode == "", ] + tryCatch( + { + # Display problematic rows + if (nrow(missing_code_rows) > 0) { + stop(paste("Rows with missing values in the", ColumnNames_ab[1], "or", ColumnNames_ab[2], "column of the AB data:")) + } + }, error = function(e) { + message("Error in analyseCorrespondenceTable: ",conditionMessage(e)) + print(missing_code_rows) + }) + + # # Check if there are any valid records after filtering + # if (nrow(AB) == 0) { + # stop("No valid records found in the AB file.") + # } + + # Find duplicated combinations of Acode and Bcode in AB + duplicate_pairs <- ab_data[duplicated(ab_data[c("Acode", "Bcode")]), c("Acode", "Bcode")] + tryCatch( + { + # Check for duplicate combinations of Acode and Bcode + if (nrow(duplicate_pairs) > 0) { + stop("Please remove duplicate(s) combinations of Acode and Bcode found in AB file.") + } + }, error = function(e) { + message("Error in analyseCorrespondenceTable:",conditionMessage(e)) + print(duplicate_pairs) + stop(e) + }) + + # Filter AB data based on longestAcodeOnly and longestBcodeOnly, if specified + + if (longestAcodeOnly == TRUE | longestBcodeOnly == TRUE) { + if (longestAcodeOnly == TRUE) { + # Calculate the maximum length of Acode + maxLengthA <- max(nchar(ab_data$Acode, type = "width")) + # Filter rows where Acode has the maximum length + longest_Acode <- ab_data$Acode[nchar(ab_data$Acode, type = "width") == maxLengthA ] + if (length(longest_Acode) == nrow(ab_data)) { + ab_data$Acode <- longest_Acode + } else { + ab_data$Acode <- c(longest_Acode, rep("", nrow(ab_data) - 
length(longest_Acode))) + } + } + if (longestBcodeOnly == TRUE){ + # Calculate the maximum length of Bcode + maxLengthB <- max(nchar(ab_data$Bcode, type = "width")) + # Filter rows where Bcode has the maximum length + longest_Bcode <- ab_data$Bcode[nchar(ab_data$Bcode, type = "width") == maxLengthB ] + if (length(longest_Bcode) == nrow(ab_data)) { + ab_data$Bcode <- longest_Bcode + } else { + ab_data$Bcode <- c(longest_Bcode, rep("", nrow(ab_data) - length(longest_Bcode))) + } + } + + # Check if there are any valid records after filtering + if (nrow(ab_data) == 0) { + stop("No valid records found in the AB file after applying longestAcodeOnly and/or longestBcodeOnly filters.") + } + if (length(ab_data$Acode) != length(ab_data$Bcode)) { + stop("Invalid records found in the AB file after applying the longestAcodeOnly and/or longestBcodeOnly filters. Acode and Bcode have different number of rows") + } + } + + # Read A file if provided + if (!is.null(A)) { + # a_data <- read.csv(A, header = TRUE) + + a_data <- testInputTable("Source classification table (A)", A) + + ColumnNames_a <- colnames(A)[1:1] + colnames(a_data)[1:1] = c("Acode") + unused_data_a <- a_data + + # Check if there are any records + tryCatch( + { + if (nrow(a_data) == 0) { + stop("No valid records found in the input table ource classification table (A).") + } + }, error = function(e) { + message("Error in analyseCorrespondenceTable: ",conditionMessage(e)) + }) + + # Check uniqueness of A codes + duplicate_a_codes <- duplicated(a_data$Acode) | duplicated(a_data$Acode, fromLast = TRUE) + tryCatch( + { + if (nrow(duplicate_a_codes) > 0) { + stop("Duplicate Acode(s) found in A file.") + } + },error = function(e) { + message("Error in analyseCorrespondenceTable: ",conditionMessage(e)) + print(duplicate_a_codes) + }) + + # Filter out A records based on longestAcodeOnly, if specified + if (longestAcodeOnly == TRUE) { + # Calculate the maximum length of Acode + maxLengthA <- max(nchar(a_data$Acode, type = 
"width")) + + # Filter rows where Acode has the maximum length + longest_Acode <- a_data$Acode[nchar(a_data$Acode, type = "width") == maxLengthA ] + if (length(longest_Acode) == nrow(a_data)) { + a_data$Acode <- longest_Acode + } else { + a_data$Acode <- c(longest_Acode, rep("", nrow(a_data) - length(longest_Acode))) + } + + # Check if there are any valid records after end position filtering + tryCatch( + { + if (nrow(a_data) == 0) { + stop("No valid records found in the A file after applying end position filter.") + } + },error = function(e) { + message("Error in analyseCorrespondenceTable: ",conditionMessage(e)) + stop(e) + }) + } + + # Find unmatched source classification codes for A + unmatched_codes_A <- setdiff(a_data$Acode, ab_data$Acode) + noCorrespondenceA <- a_data[a_data$Acode %in% unmatched_codes_A, ] + noClassificationA <- ab_data[ab_data$Acode %in% unmatched_codes_A, ] + + # Check if all codes in A are covered + if (nrow(noCorrespondenceA) > 0) { + message("Warning:Number of unmatched source classification codes in A:\n") + print(noCorrespondenceA) + } + + # Check if all codes in the correspondence table are covered + if (nrow(noClassificationA) > 0) { + message("Warning:Number of source classification codes in AB not found in A:\n") + print(noClassificationA) + } + } + + # Read B file if provided + if (!is.null(B)) { + + #b_data <- read.csv(B, header = TRUE) + b_data <- testInputTable("Target classification table (B)", B) + ColumnNames_b <- colnames(B)[1:1] + colnames(b_data)[1:1] = c("Bcode") + unused_data_b <- b_data + + # Check if there are any records + tryCatch( + { + if (nrow(b_data) == 0) { + stop("No valid records found in the input table ource classification table (B).") + } + }, error = function(e) { + message("Error in analyseCorrespondenceTable: ",conditionMessage(e)) + }) + + # Check uniqueness of B codes + duplicate_b_codes <- duplicated(b_data$Bcode) | duplicated(b_data$Bcode, fromLast = TRUE) + tryCatch( + { + if 
(nrow(duplicate_b_codes) > 0) { + stop("Duplicate Bcode(s) found in B file.") + } + },error = function(e) { + message("Error in analyseCorrespondenceTable: ",conditionMessage(e)) + print(duplicate_b_codes) + }) + + # Filter out B records based on longestBcodeOnly, if specified + if (longestBcodeOnly == TRUE) { + # Calculate the maximum length of Bcode + maxLengthB <- max(nchar(b_data$Bcode, type = "width")) + + # Filter rows where Bcode has the maximum length + longest_Bcode <- b_data$Bcode[nchar(b_data$Bcode, type = "width") == maxLengthB ] + if (length(longest_Bcode) == nrow(b_data)) { + b_data$Bcode <- longest_Bcode + } else { + b_data$Bcode <- c(longest_Bcode, rep("", nrow(b_data) - length(longest_Bcode))) + } + + # Check if there are any valid records after end position filtering + tryCatch( + { + if (nrow(b_data) == 0) { + stop("No valid records found in the B file after applying end position filter.") + } + },error = function(e) { + message("Error in analyseCorrespondenceTable: ",conditionMessage(e)) + stop(e) + }) + } + + # Find unmatched source classification codes for B + unmatched_codes_B <- setdiff(b_data$Bcode, ab_data$Bcode) + noCorrespondenceB <- b_data[b_data$Bcode %in% unmatched_codes_B, ] + noClassificationB <- ab_data[ab_data$Bcode %in% unmatched_codes_B, ] + + ## Check if all codes in B are covered + if (nrow(noCorrespondenceB) > 0) { + message("Warning:Number of unmatched source classification codes in B:\n") + print(noCorrespondenceB) + } + + # Check if all codes in the correspondence table are covered + if (nrow(noClassificationB) > 0) { + message("Warning: Number of source classification codes in AB not found in B:\n") + print(noClassificationB) + } + } + + + + #bipartitePart + # create the bipartite graph + g <- graph.data.frame(ab_data, directed = FALSE) + + # all composant + components <- decompose.graph(g) + + # list of composant by code + component_codes <- lapply(components, function(comp) V(comp)$name) + + ab_data$component <- NA + + 
for (i in seq_along(component_codes)) { + component <- component_codes[[i]] + ab_data$component[ab_data$Acode %in% component] <- paste("Component", i) + } + ### Print AB to see the component column for the correspondenceTable between Source & Target. + + component_codes <- unlist(component_codes) + + # Creation of the table for each component + component_index <- 0 + component_stats <- lapply(components, function(comp) { + component <- V(comp)$name + n_unique_targets <- length(unique(ab_data[ab_data$Acode %in% component, "Bcode"])) + + correspondence_type <- if (n_unique_targets == 1) { + if (length(unique(ab_data[ab_data$Bcode %in% component, "Acode"])) == 1) { + "1:1" + } else { + "M:1" + } + } else { + if (length(unique(ab_data[ab_data$Bcode %in% component, "Acode"])) == 1) { + "1:M" + } else { + "M:M" + } + } + + source_positions <- unique(ab_data[ab_data$Acode %in% component, "Acode"]) + target_positions <- unique(ab_data[ab_data$Acode %in% component, "Bcode"]) + n_source_positions <- length(source_positions) + n_target_positions <- length(target_positions) + + component_index <- component_index + 1 + component_name <- unique(ab_data[ab_data$Acode %in% component, "component"]) + + list( + Component = component_name, + CorrespondenceType = correspondence_type, + SourcePositions = source_positions, + TargetPositions = target_positions, + nSourcePositions = n_source_positions, + nTargetPositions = n_target_positions + ) + }) + + # Creation of the new dataFrame + result <- do.call(rbind, component_stats) + + # Conversion into a data frame + Inventory <- as.data.frame(result) + + ## Creation of Annex B (Analysis) + Analysis <- data.frame( + ClassC = ab_data$Acode, + ClassD = ab_data$Bcode, + nTargetClasses = NA, + SourceToTargetMapping = NA, + nSourceClasses = NA, + TargetToSourceMapping = NA + ) + # Update nTargetClasses column + Analysis$nTargetClasses <- sapply(Analysis$ClassC, function(c_code) { + length(unique(Analysis[Analysis$ClassC == c_code, "ClassD"])) + 
}) + + # Update SourceToTargetMapping column + Analysis$SourceToTargetMapping <- sapply(Analysis$ClassC, function(c_code) { + paste(unique(Analysis[Analysis$ClassC == c_code, "ClassD"]), collapse = ", ") + }) + + # Update nSourceClasses column + Analysis$nSourceClasses <- sapply(Analysis$ClassD, function(d_code) { + length(unique(Analysis[Analysis$ClassD == d_code, "ClassC"])) + }) + + # Update TargetToSourceMapping column + Analysis$TargetToSourceMapping <- sapply(Analysis$ClassD, function(d_code) { + paste(unique(Analysis[Analysis$ClassD == d_code, "ClassC"]), collapse = ", ") + }) + + + + colnames(Analysis)[1:2] = c("Acode", "Bcode") + # store annex on variable to make table + output_Inventory <- Inventory + output_Analysis <- Analysis + + + Inventory_df <- as.data.frame(output_Inventory) + + Inventory_df$Component <- as.character(Inventory_df$Component) + Inventory_df$CorrespondenceType <- as.character(Inventory_df$CorrespondenceType) + #Inventory_df$SourcePositions <- as.character(Inventory_df$SourcePositions) + Inventory_df$SourcePositions <- sapply(Inventory_df$SourcePositions, function(x) paste(x, collapse = ", ")) + #Inventory_df$TargetPositions <- as.character(Inventory_df$TargetPositions) + Inventory_df$TargetPositions <- sapply(Inventory_df$TargetPositions, function(x) paste(x, collapse = ", ")) + Inventory_df$nSourcePositions <- as.numeric(Inventory_df$nSourcePositions) + Inventory_df$nTargetPositions <- as.numeric(Inventory_df$nTargetPositions) + + Analysis_df <- as.data.frame(output_Analysis) + Analysis_df <- merge(Analysis_df, unused_data_ab, by = c("Acode", "Bcode"), all = F) + + if (!is.null(B)) { + Analysis_df <- merge(Analysis_df, unused_data_b, by = "Bcode", all = F) + } + + if (!is.null(A)) { + Analysis_df <- merge(Analysis_df, unused_data_a, by = "Acode", all = F) + } + + Analysis_df <- Analysis_df[, c("Acode", "Bcode", setdiff(names(Analysis_df), c("Acode", "Bcode")))] + + colnames(Analysis_df)[1:2] = ColumnNames_ab[1:2] + base_file_name <- 
paste0("correspondence_analysis_", format(Sys.time(), "%Y%m%d%H%M%S")) + + CsvFileSave(CSVcorrespondenceInventory, Inventory_df) + CsvFileSave(CSVcorrespondenceAnalysis, Analysis_df) + + # Output list of the two dataframes. + output <- list(Inventory = Inventory_df, Analysis = Analysis_df) + + return(output) +} + + + + diff --git a/R/newCorrespondenceTable.R b/R/newCorrespondenceTable.R index 7ec3f1f..e02ceff 100644 --- a/R/newCorrespondenceTable.R +++ b/R/newCorrespondenceTable.R @@ -1,2033 +1,2033 @@ -#' @title Ex novo creation of candidate correspondence tables between two classifications via pivot tables -#' @description Creation of a candidate correspondence table between two classifications, A and B, when there are -#' correspondence tables leading from the first classification to the second one via \eqn{k} intermediate pivot -#' classifications \eqn{C_1, \ldots, C_k}. -#' The correspondence tables leading from A to B are A:\eqn{C_1}, \{\eqn{C_i}:\eqn{C_{i+1}}: \eqn{1 \le i \le k -1}\}, B:\eqn{C_k}. -#' @param Tables A string of type character containing the name of a csv file which contains the names of the files that -#' contain the classifications and the intermediate correspondence tables (see "Details" below). -#' @param CSVout The preferred name for the \emph{output csv files} that will contain the candidate correspondence table -#' and information about the classifications involved. The valid values are \code{NULL} or strings of type \code{character}. -#' If the selected value is \code{NULL}, the default, no output file is produced. If the value is a string, then the output -#' is exported into two csv files whose names contain the provided name (see "Value" below). -#' @param Reference The reference classification among A and B. If a classification is the reference to the other, and hence -#' \emph{hierarchically superior} to it, each code of the other classification is expected to be mapped to at most one code -#' of the reference classification. 
The valid values are \code{"none"}, \code{"A"}, and \code{"B"}. If the selected value -#' is \code{"A"} or \code{"B"}, a "Review" flag column (indicating the records violating this expectation) is included -#' in the output (see "Explanation of the flags" below). -#' @param MismatchTolerance The maximum acceptable proportion of rows in the candidate correspondence table which contain -#' no code for classification A or no code for classification B. The default value is \code{0.2}. The valid values are -#' real numbers in the interval [0, 1]. -#' @param Redundancy_trim An argument in the function containing the logical values \code{TRUE} or \code{FALSE} -#' used to facilitate the trimming of the redundant records. -#' The default value is \code{TRUE}, which removes all redundant records. -#' The other values is \code{FALSE}, which shows redundant records together with the redundancy flag. -#' @export -#' @details -#' File and file name requirements: -#' \itemize{ -#' \item The file that corresponds to argument \code{Tables} and the files to which the contents of \code{Tables} -#' lead, must be in \emph{csv format with comma as delimiter}. If full paths are not provided, then these files must -#' be available in the working directory. No two filenames provided must be identical. -#' \item The file that corresponds to argument \code{Tables} must contain filenames, \emph{and nothing else}, in -#' a \eqn{(k+2)} × \eqn{(k+2)} table, where \eqn{k}, a positive integer, is the number of "pivot" classifications. -#' The cells in the main diagonal of the table provide the filenames of the files which contain, with this order, -#' the classifications A, \eqn{C_1}, \eqn{\ldots}, \eqn{C_k} and B. The off-diagonal directly above the main -#' diagonal contains the filenames of the files that contain, with this order, the correspondence tables -#' A:\eqn{C_1}, \{\eqn{C_i}:\eqn{C_{i+1}}, \eqn{1 \le i \le k-1}\} and B:\eqn{C_k}. All other cells of the table -#' must be empty. 
-#' \item If any of the two files where the output will be stored is read protected (for instance because it is open -#' elsewhere) an error message will be reported and execution will be halted. -#' } -#' Classification table requirements: -#' \itemize{ -#' \item Each of the files that contain classifications must contain at least one column and at least two rows. -#' The first column contains the codes of the respective classification. The first row contains column headers. -#' The header of the first column is the name of the respective classification (e.g., "CN 2021"). -#' \item The classification codes contained in a classification file (expected in its first column as mentioned -#' above) must be unique. No two identical codes are allowed in the column. -#' \item If any of the files that contain classifications has additional columns the first one of them is assumed -#' to contain the labels of the respective classification codes. -#' } -#' Correspondence table requirements: -#' \itemize{ -#' \item The files that contain correspondence tables must contain at least two columns and at least two rows. -#' The first column of the file that contains A:\eqn{C_1} contains the codes of classification A. The second column -#' contains the codes of classification \eqn{C_1}. Similar requirements apply to the files that contain -#' \eqn{C_i}:\eqn{C_{i+1}}, \eqn{1 \le i \le k-1} and B:\eqn{C_k}. The first row of each of the files that contain -#' correspondence tables contains column headers. The names of the first two columns are the names of the respective -#' classifications. -#' \item The pairs of classification codes contained in a correspondence table file (expected in its first two columns -#' as mentioned above) must be unique. No two identical pairs of codes are allowed in the first two columns. 
-#' } -#' Interdependency requirements: -#' \itemize{ -#' \item At least one code of classification A must appear in both the file of classification A and the file of -#' correspondence table A:\eqn{C_1}. -#' \item At least one code of classification B must appear in both the file of classification B and the file of -#' correspondence table B:\eqn{C_k}, where \eqn{k}, \eqn{k\ge 1}, is the number of pivot classifications. -#' \item If there is only one pivot classification, \eqn{C_1}, at least one code of it must appear in both the file of -#' correspondence table A:\eqn{C_1} and the file of correspondence table B:\eqn{C_1}. -#' \item If the pivot classifications are \eqn{k} with \eqn{k\ge 2} then at least one code of \eqn{C_1} must appear in -#' both the file of correspondence table A:\eqn{C_1} and the file of correspondence table \eqn{C_1}:\eqn{C_2}, at least one -#' code of each of the \eqn{C_i}, \eqn{i = 2, \ldots, k-1} (if \eqn{k\ge 3}) must appear in both the file of correspondence -#' table \eqn{C_{i-1}}:\eqn{C_i} and the file of correspondence table \eqn{C_i}:\eqn{C_{i+1}}, and at least one code of -#' \eqn{C_k} must appear in both the file of correspondence table \eqn{C_{k-1}}:\eqn{C_k} and the file of correspondence table -#' B:\eqn{C_k}. -#' } -#' Mismatch tolerance: -#' \itemize{ -#' \item The ratio that is compared with \code{MismatchTolerance} has as numerator the number of rows in the candidate -#' correspondence table which contain no code for classification A or no code for classification B and as denominator -#' the total number of rows of this table. If the ratio exceeds \code{MismatchTolerance} the execution of the function -#' is halted. -#' } -#' If any of the conditions required from the arguments is violated an error message is produced and execution is stopped. -#' -#' @section Explanation of the flags: -#' -#' \itemize{ -#' \item The "Review" flag is produced only if argument Reference has been set equal to "\code{A}" or "\code{B}". 
For -#' each row of the candidate correspondence table, if \code{Reference} = "\code{A}" the value of "Review" is equal to -#' \code{1} if the code of B maps to more than one code of A, and \code{0} otherwise. If \code{Reference} = "\code{B}" -#' the value of "Review" is equal to \code{1} if the code of A maps to more than one code of B, and \code{0} otherwise. -#' The value of the flag is empty if the row does not contain a code of A or a code of B. -#' \item For each row of the candidate correspondence table, the value of "Redundancy" is equal to \code{1} if the row -#' contains a combination of codes of A and B that also appears in at least one other row of the candidate -#' correspondence table. -#' \item When "Redundancy_Trim" is equal to \code{FALSE} the "Redundancy_keep" flag is created to identify with value \code{1} -#' the records that will be kept if trimming is performed. -#' \item For each row of the candidate correspondence table, the value of "Unmatched" is equal to \code{1} if the row -#' contains a code of A but no code of B or if it contains a code of B but no code of A. The value of the flag is -#' \code{0} if the row contains codes for both A and B. -#' \item For each row of the candidate correspondence table, the value of "NoMatchFromA" is equal to \code{1} if the row -#' contains a code of A that appears in the table of classification A but not in correspondence table A:\eqn{C_1}. The -#' value of the flag is \code{0} if the row contains a code of A that appears in both the table of classification A and -#' correspondencetable A:\eqn{C_1}. Finally, the value of the flag is empty if the row contains no code of A or if it -#' contains a code of A that appears in correspondence table A:\eqn{C_1} but not in the table of classification A. 
-#' \item For each row of the candidate correspondence table, the value of "NoMatchFromB" is equal to \code{1} if the row -#' contains a code of B that appears in the table of classification B but not in correspondence table B:\eqn{C_k}. The -#' value of the flag is \code{0} if the row contains a code of B that appears in both the table of classification B and -#' correspondence table B:\eqn{C_k}. Finally, the value of the flag is empty if the row contains no code of B or if it -#' contains a code of B that appears in correspondence table B:\eqn{C_k} but not in the table of classification B. -#' \item The argument "Redundancy_trim" is used to delete all the redundancies which are mapping correctly. -#' The valid logical values for this argument in the candidate correspondence table are \code{TRUE} or \code{FALSE}. -#' If the selected value is \code{TRUE}, all redundant records are removed and kept exactly one record for each unique combination. -#' For this retained record, the codes, the label and the supplementary information of the pivot classifications are replaced with -#' 'multiple'. If the multiple infomration of the pivot classifications are the same, their value will not be replaced. -#' If the selected value is \code{FALSE}, no trimming is executed so redundant records are shown, together with the redundancy flag. -#' If the logical values are missing the implementation of the function will stop. -#' -#' } -#' -#' @section Sample datasets included in the package: -#' -#' Running \code{browseVignettes("correspondenceTables")} in the console opens an html page in the user's default browser. Selecting HTML from the menu, users can read information about the use of the sample datasets that are included in the package. -#' If they wish to access the csv files with the sample data, users have two options: -#' \itemize{ -#' \item Option 1: Unpack into any folder of their choice the tar.gz file into which the package has arrived. 
All sample -#' datasets may be found in the "inst/extdata" subfolder of this folder. -#' \item Option 2: Go to the "extdata" subfolder of the folder in which the package has been installed in their PC's \code{R} -#' library. All sample datasets may be found there. -#' } -#' -#' @return -#' \code{newCorrespondenceTable()} returns a list with two elements, both of which are data frames. -#' \itemize{ -#' \item The first element is the candidate correspondence table A:B, including the codes of all "pivot" classifications, -#' augmented with flags "Review" (if applicable), "Redundancy", "Unmatched", "NoMatchFromA", "NoMatchFromB" and with all -#' the additional columns of the classification and intermediate correspondence table files. -#' \item The second element contains the names of classification A, the "pivot" classifications and classification B as -#' read from the top left-hand side cell of the respective input files. -#' \item If the value of argument \code{CSVout} a string of type \code{character}, the elements of the list are exported -#' into files of csv format. The name of the file for the first element is the value of argument \code{CSVout} and the -#' name of the file for the second element is classificationNames_\code{CSVout}. For example, if -#' \code{CSVout} = "newCorrespondenceTable.csv", the elements of the list are exported into "newCorrespondenceTable.csv" -#' and "classificationNames_newCorrespondenceTable.csv" respectively. -#' } -#' -#' @examples -#' { -#' ## Application of function newCorrespondenceTable() with "example.csv" being the file -#' ## that includes the names the files and the intermediate tables in a sparse square -#' ## matrix containing the 100 rows of the classifications (from ISIC v4 to CPA v2.1 through -#' ## CPC v2.1). 
The desired name for the csv file that will contain the candidate -#' ## correspondence table is "newCorrespondenceTable.csv", the reference classification is -#' ## ISIC v4 ("A") and the maximum acceptable proportion of unmatched codes between -#' ## ISIC v4 and CPC v2.1 is 0.56 (this is the minimum mismatch tolerance for the first 100 row -#' ## as 55.5% of the code of ISIC v4 is unmatched). -#' -#' tmp_dir<-tempdir() -#' A <- read.csv(system.file("extdata", "example.csv", package = "correspondenceTables"), -#' header = FALSE, -#' sep = ",") -#' for (i in 1:nrow(A)) { -#' for (j in 1:ncol(A)) { -#' if (A[i,j]!="") { -#' A[i, j] <- system.file("extdata", A[i, j], package = "correspondenceTables") -#' }}} -#' write.table(x = A, -#' file = file.path(tmp_dir,"example.csv"), -#' row.names = FALSE, -#' col.names = FALSE, -#' sep = ",") -#' -#' NCT<-newCorrespondenceTable(file.path(tmp_dir,"example.csv"), -#' file.path(tmp_dir,"newCorrespondenceTable.csv"), -#' "A", -#' 0.56, -#' FALSE) -#' -#' summary(NCT) -#' head(NCT$newCorrespondenceTable) -#' NCT$classificationNames -#' csv_files<-list.files(tmp_dir, pattern = ".csv") -#' unlink(csv_files) -#' } - -newCorrespondenceTable <- function(Tables, CSVout = NULL, Reference = "none", MismatchTolerance = 0.2, Redundancy_trim = TRUE) { - - # Check if the file that contains the names of both classifications and - # correspondence tables exists in working directory - if (!file.exists(Tables)) { - stop(simpleError(paste("There is no file with name", Tables, "in your working directory."))) - } else { - # x <- as.matrix(utils::read.csv(Tables, sep = ",", header = FALSE, colClasses = c("character"), - # encoding = "UTF-8")) - x <- as.matrix(data.table::fread(Tables, sep = ",", header = FALSE, colClasses = c("character"), - encoding = "UTF-8")) - mat.list <- apply(x, 2, function(x) { - as.character(which(x != "")) - }) - } - - # Check if files exist in working directory - test.names <- as.vector(x)[which(as.vector(x) != "")] - if 
(!all(file.exists(test.names))) { - for (i in which(file.exists(test.names) == FALSE)) { - stop(simpleError(paste("The is no file with name", test.names[i], "in your working directory."))) - } - } - - if (length(which(duplicated(test.names) == TRUE)) >= 1) { - stop(simpleError(paste("At least two of the filenames in", Tables, "are the same."))) - } - - # Check CSVout - if (!is.null(CSVout)) { - while (file.exists(CSVout)) { - message(paste("Your working directory contains already a file with the name that you selected for the output file: ", - CSVout)) - answer <- utils::menu(c("Yes", "No"), title = "Do you want to overwrite it?") - if (answer == 2) { - CSVout <- readline(prompt = "Please enter a new name for the output file: ") - } - if (answer == 1) { - break - } - } - } - - - # Check Reference - if (!(Reference %in% c("A", "B", "none"))) { - stop(simpleError("You entered a non-allowed value for Reference. The allowed values are \"A\", \"B\" and \"none\".")) - } - - # Check MismatchTolerance - if (is.character(MismatchTolerance) || MismatchTolerance < 0 || MismatchTolerance > - 1) { - stop(simpleError("You entered a non-allowed value for MismatchTolerance. The allowed values are numbers in the interval [0, 1].")) - } - - - test.list <- list() - test.list[[1]] <- "1" - for (mat.index in 2:ncol(x)) { - test.list[[mat.index]] <- as.character(c((mat.index - 1):mat.index)) - } - - # The following if statement checks if the names of both classifications - # and correspondence tables in the 'names.csv' file construct a sparse - # square matrix. - if (all(unlist(Map(identical, mat.list, test.list)) == TRUE) && nrow(x) >= 3) { - k <- nrow(x) - 2 - - } else { - # Error message in case the names of both classifications and - # correspondence tables in the 'names.csv' file do not construct a - # sparse square matrix. - stop(paste("The filenames in", Tables, "do not construct a sparse square matrix. 
\n Please verify that the appropriate number of filenames are inserted in the appropriate cells.")) - } - - # The list inputs includes the names of both classifications and - # correspondence tables. - inputs <- list() - inputs[[1]] <- diag(x)[1] - inputs[seq(k) + 1] = as.list(diag(x)[seq(k) + 1]) - inputs[[k + 2]] <- diag(x)[length(diag(x))] - inputs[(k + 3):(k + 2 + length(as.list(x[upper.tri(x)][x[upper.tri(x)] != ""])))] <- as.list(x[upper.tri(x)][x[upper.tri(x)] != - ""]) - - # Create a list of the classifications and the known correspondence tables - # as data frames. - RRR <- lapply(inputs[1:length(inputs)], function(x) { - utils::read.csv(x, sep = ",", check.names = FALSE, colClasses = c("character"), - encoding = "UTF-8") - # data.table::fread(x, sep = ",", check.names = FALSE, colClasses = c("character"), - # encoding = "UTF-8") - }) - - removeBOM <- function(headers) { - gsub("\\xef\\xbb\\xbf", "", headers, useBytes = T) - } - - for (i in 1:length(RRR)) { - colnames(RRR[[i]]) <- removeBOM(colnames(RRR[[i]])) - } - - # Convert data frames into matrices. - RR <- lapply(RRR, function(x) { - matrix(unlist(x), ncol = ncol(x)) - }) - - # Select the correspondence tables. - R <- RR[utils::tail(c(1:length(RR)), (length(RR) - 1)/2)] - - # Check the dimensions of the files - for (i in 1:nrow(x)) { - if (ncol(RRR[[i]]) < 1 || nrow(RRR[[i]]) < 1) { - stop(simpleError(paste("File", inputs[i], "should have at least one column and two rows (including the row of headers)."))) - } - } - - for (i in 1:length(R)) { - if (ncol(R[[i]]) <= 1 || nrow(R[[i]]) < 1) { - stop(simpleError(paste("File", inputs[i + nrow(x)], "should have at least two columns and two rows (including the row of headers)."))) - } - } - - # Check for entries dimensions of the files - for (i in 1:nrow(x)) { - if (sum(duplicated(RRR[[i]][, 1])) >= 1) { - stop(simpleError(paste("At least one code of ", colnames(RRR[[i]])[1], - " appears more than once in file ", inputs[i], ". This is an error. 
Each code must appear only once in the file.", - sep = ""))) - } - } - - for (i in 1:length(R)) { - if (nrow(unique(R[[i]][, 1:2])) != nrow(R[[i]][, 1:2])) { - stop(simpleError(paste("At least one pair of codes of ", colnames(RRR[[i + - nrow(x)]])[1], " and ", colnames(RRR[[i + nrow(x)]])[2], " appears more than once in file ", - inputs[i + nrow(x)], ". This is an error. Each pair of codes must appear only once in the file.", - sep = ""))) - } - } - - # Check for at least one match in classifications and correspondence - # tables. In inputs there are the names of both classifications and - # correspondence tables. Stop with error - if (k == 1) { - # A in A appears in A:C1 - if (sum(!is.na(match(unlist(RRR[[1]][, 1]), R[[1]][, 1]))) == 0) { - stop(simpleError(paste("There is no code of ", colnames(RRR[[1]])[1], - " that appears in both ", inputs[1], " and ", inputs[1 + nrow(x)], - ". This is an error. The files should have at least one code of ", - colnames(RRR[[1]])[1], " in common to allow the generation of the candidate correspondence table.", - sep = ""))) - } - - # C1 in A:C1 appears in B:C1 - if (sum(!is.na(match(R[[1]][, 2], R[[2]][, 2]))) == 0) { - stop(simpleError(paste("There is no code of ", colnames(RRR[[1 + nrow(x)]])[2], - " that appears in both ", inputs[1 + nrow(x)], " and ", inputs[2 + - nrow(x)], ". This is an error. The files should have at least one code of ", - colnames(RRR[[1 + nrow(x)]])[2], " in common to allow the generation of the candidate correspondence table.", - sep = ""))) - } - - # B in B:C1 appears in B - if (sum(!is.na(match(R[[length(R)]][, 1], unlist(RRR[[nrow(x)]][, 1])))) == 0) { - stop(simpleError(paste("There is no code of ", colnames(RRR[[length(R) + - nrow(x)]])[1], " that appears in both ", inputs[nrow(x)], " and ", - inputs[length(R) + nrow(x)], ". This is an error. 
The files should have at least one code of ", - colnames(RRR[[length(R) + nrow(x)]])[1], " in common to allow the generation of the candidate correspondence table.", - sep = ""))) - } - - } - - if (k >= 2) { - - # A in A appears in A:C1 - if (sum(!is.na(match(unlist(RRR[[1]][, 1]), R[[1]][, 1]))) == 0) { - stop(simpleError(paste("There is no code of ", colnames(RRR[[1]])[1], - " that appears in both ", inputs[1], " and ", inputs[1 + nrow(x)], - ". This is an error. The files should have at least one code of ", - colnames(RRR[[1]])[1], " in common to allow the generation of the candidate correspondence table.", - sep = ""))) - } - - # C1 in A:C1 appears in C1:C2 C2 in C1:C2 appears in C2:C3 ... - for (i in 1:(k - 1)) { - - if (sum(!is.na(match(R[[i]][, 2], R[[i + 1]][, 1]))) == 0) { - stop(simpleError(paste("There is no code of ", colnames(RRR[[i + - nrow(x)]])[2], " that appears in both ", inputs[i + nrow(x)], " and ", - inputs[i + 1 + nrow(x)], ". This is an error. The files should have at least one code of ", - colnames(RRR[[i + nrow(x)]])[2], " in common to allow the generation of the candidate correspondence table.", - sep = ""))) - } - - } - - # Ck in C(k-1):Ck appears in B:Ck - if (sum(!is.na(match(R[[k]][, 2], R[[k + 1]][, 2]))) == 0) { - stop(simpleError(paste("There is no code of ", colnames(RRR[[k + nrow(x)]])[2], - " that appears in both ", inputs[k + nrow(x)], " and ", inputs[k + - 1 + nrow(x)], ". This is an error. The files should have at least one code of ", - colnames(RRR[[k + nrow(x)]])[2], " in common to allow the generation of the candidate correspondence table.", - sep = ""))) - } - - # B in B:Ck appears in B - if (sum(!is.na(match(R[[length(R)]][, 1], unlist(RRR[[nrow(x)]][, 1])))) == 0) { - stop(simpleError(paste("There is no code of ", colnames(RRR[[length(R) + - nrow(x)]])[1], " that appears in both ", inputs[nrow(x)], " and ", - inputs[length(R) + nrow(x)], ". This is an error. 
The files should have at least one code of ", - colnames(RRR[[length(R) + nrow(x)]])[1], " in common to allow the generation of the candidate correspondence table.", - sep = ""))) - } - - } - - # Warning - if (k == 1) { - - # C1 in C1 appears in A:C1 - if (sum(!is.na(match(unlist(RRR[[2]][, 1]), R[[1]][, 2]))) == 0) { - message(paste("WARNING: there is no code of ", colnames(RRR[[2]])[1], " that appears in both ", - inputs[2], " and ", inputs[1 + nrow(x)], ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", - sep = "")) - } - - # C1 in C1 appears in B:C1 - if (sum(!is.na(match(unlist(RRR[[2]][, 1]), R[[2]][, 2]))) == 0) { - message(paste("WARNING: there is no code of ", colnames(RRR[[2]])[1], " that appears in both ", - inputs[2], " and ", inputs[2 + nrow(x)], ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", - sep = "")) - } - - } - - if (k == 2) { - - for (i in 2:k) { - - # C1 in C1 appears in A:C1 C2 in C2 appears in C1:C2 C3 in C3 - # appears in C2:C3 - if (sum(!is.na(match(unlist(RRR[[i]][, 1]), R[[i - 1]][, 2]))) == 0) { - message(paste("WARNING: there is no code of ", colnames(RRR[[i]])[1], - " that appears in both ", inputs[i], " and ", inputs[i - 1 + nrow(x)], - ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", - sep = "")) - } - - # C1 in C1 appears in C1:C2 C2 in C2 appears in C2:C3 C3 in C3 - # appears in C3:C4 - if (sum(!is.na(match(unlist(RRR[[i]][, 1]), R[[i]][, 1]))) == 0) { - message(paste("WARNING: there is no code of ", colnames(RRR[[i]])[1], - " that appears in both ", inputs[i], " and ", inputs[i + nrow(x)], - ". 
When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", - sep = "")) - } - } - - # Ck in Ck appears in C(k-1):Ck - if (sum(!is.na(match(unlist(RRR[[k + 1]][, 1]), R[[k]][, 2]))) == 0) { - message(paste("WARNING: there is no code of ", colnames(RRR[[k + 1]])[1], - " that appears in both ", inputs[k + 1], " and ", inputs[k + nrow(x)], - ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", - sep = "")) - } - - # Ck in Ck appears in B:Ck - if (sum(!is.na(match(unlist(RRR[[k + 1]][, 1]), R[[k + 1]][, 2]))) == 0) { - message(paste("WARNING: there is no code of ", colnames(RRR[[k + 1]])[1], - " that appears in both ", inputs[k + 1], " and ", inputs[k + 1 + - nrow(x)], ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", - sep = "")) - } - - } - - # Create the final correspondence table moving from the classification A to - # the classification B. - tryCatch({ - - F_AtoB <- list() - - # The following if statement is used when we have only the - # correspondence tables A:C1 and B:C1. - counter <- 0 - if (length(R) == 2) { - #creating a progress bar - message("Percentage of codes of ", colnames(RRR[[1]][1]), " processed:") - pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") - - # The following for loop creates the desirable correspondence - # table. The operations are conducted for each unique element of - # classification A of the correspondence table A:C1. - for (i in unique(R[[1]][, 1])) { - - # Print the percentage of codes that have been processed. 
- counter <- counter + 1 - setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) - # Matrix TT contains the rows of correspondence table A:C1 for - # a specific element of classification A. Matrix T contains - # the rows of correspondence table B:C1 that match with the - # specific element of classification A based on classification - # C1. - x1 <- R[[1]][which(R[[1]][, 1] == i), 2] - TT <- matrix(R[[1]][which(R[[1]][, 1] == i), 1:2], ncol = 2) - T <- matrix(R[[2]][!is.na(match(R[[2]][, 2], x1)), 1:2], ncol = 2) - - # Create a list whose each element is a matrix that contains - # all unique rows of matrix T based on the elements of - # classification C1. - t <- match(T[, 2], T[, 2]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - T[order(t)[x], , drop = FALSE] - }) - - # Create a list whose each element is a matrix that contains - # all unique rows of matrix TT that match with the unique - # elements of the second column of matrix T. - t1 <- match(TT[, 2], T[, 2]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - TT[order(t1)[x], , drop = FALSE] - }) - - # Keep matrices in Z that exist in Z1 based on their second - # columns (elements of classification C1). - Z <- Z[!is.na(match(lapply(Z, function(x) { - unique(x[, 2]) - }), lapply(Z1, function(x) { - unique(x[, 2]) - })))] - - # ZZ is a matrix that consists of matrices in Z1 expanded by - # their corresponding matrices (based on the elements of - # classification C1). 
- a <- lapply(Z, function(x) { - 1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = 2) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - ZZ <- do.call(rbind, Map(cbind, aa1, aa)) - - # The records of A:C1 that do not exist in C1:C2 (in terms of - # the values of classification C1) are adjusted to ZZ which - # consists of records of A:C1 that exist in C1:C2 (in terms of - # the values of classification C1). - t1 <- matrix(TT[is.na(match(TT[, 2], ZZ[, 2])), ], ncol = 2) - ZZ <- rbind(ZZ, cbind(t1, matrix("", nrow = nrow(t1), ncol = 2))) - - F_AtoB[[counter]] <- ZZ - - } - } - - # The following if statement is used when we have only the - # correspondence tables A:C1, C1:C2 and B:C2. - if (length(R) == 3) { - #creating a progress bar - message("Percentage of codes of ", colnames(RRR[[1]][1]), " processed:") - pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") - - - # The following for loop creates the desirable correspondence - # table. The operations are conducted for each unique element of - # classification A of the correspondence table A:C1. - for (i in unique(R[[1]][, 1])) { - counter <- counter + 1 - setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) - - # Matrix T contains the rows of correspondence table C1:C2 that - # match with the specific element of classification A based on - # classification C1. - x1 <- R[[1]][which(R[[1]][, 1] == i), 2] - T <- matrix(R[[2]][!is.na(match(R[[2]][, 1], x1)), 1:2], ncol = 2) - - # The records of A:C1 that do not exist in C1:C2 (in terms of - # the values of classification C1). 
- if (length(which(is.na(match(x1, T[, 1])) == TRUE)) > 0) { - M1 <- matrix(matrix(R[[1]][which(R[[1]][, 1] == i), 1:2], ncol = 2)[is.na(match(x1, - T[, 1])), ], ncol = 2) - } else { - M1 = matrix(0, 1, 2 * length(R)) - M1 = M1[FALSE, ] - } - - if (nrow(M1) != 0) { - for (times in 1:(2 * length(R) - ncol(M1))) { - - M1 <- cbind(M1, "") - - } - } - - # Matrix TT contains the rows of correspondence table B:C2 that - # match with the specific element of classification A based on - # classification C1. - x2 <- R[[2]][!is.na(match(R[[2]][, 1], x1)), 2] - T1 <- matrix(R[[3]][!is.na(match(R[[3]][, 2], x2)), 1:2], ncol = 2) - - # The records of C1:C2 that do not exist in B:C2 (in terms of - # the values of classification C2). - if (length(which(is.na(match(x2, T1[, 2])) == TRUE)) > 0) { - if (length(which(is.na(match(x2, T1[, 2])) == TRUE)) == 1) { - M2 <- matrix(c(i, T[is.na(match(x2, T1[, 2])), 1], T[is.na(match(x2, - T1[, 2])), ]), ncol = 4) - } else { - M2 <- cbind(i, T[is.na(match(x2, T1[, 2])), 1], T[is.na(match(x2, - T1[, 2])), ]) - } - } else { - M2 = matrix(0, 1, 2 * length(R)) - M2 = M2[FALSE, ] - } - - if (nrow(M2) != 0) { - for (times in 1:(2 * length(R) - ncol(M2))) { - - M2 <- cbind(M2, "") - - } - } - - # Create a list whose each element is a matrix that contains - # all unique rows of matrix T based on the elements of - # classification C1. - t <- match(T[, 2], T[, 2]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - T[order(t)[x], , drop = FALSE] - }) - - # Create a list whose each element is a matrix that contains - # all unique rows of matrix TT that match with the unique - # elements of the second column of matrix T. 
- t1 <- match(T1[, 2], T[, 2]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - T1[order(t1)[x], , drop = FALSE] - }) - - # Keep matrices in Z that exist in Z1 based on their second - # columns (elements of classification C1). - Z <- Z[!is.na(match(lapply(Z, function(x) { - unique(x[, 2]) - }), lapply(Z1, function(x) { - unique(x[, 2]) - })))] - - # ZZ is a matrix that consists of matrices in Z1 expanded by - # their corresponding matrices (based on the elements of - # classification C1). - a <- lapply(Z, function(x) { - 1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = 2) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - ZZ <- do.call(rbind, Map(cbind, aa, aa1)) - - # The records of both M1 and M2 are adjusted to ZZ which - # consists of records of A:C1 that exist in C1:C2 (in terms of - # the values of classification C1). - if (is.null(dim(ZZ))) { - F_AtoB[[counter]] <- rbind(M1, M2) - } else { - F_AtoB[[counter]] <- rbind(cbind(i, ZZ[, 1], ZZ), M1, M2) - } - - } - - } - - # The following if statement is used in the general situation, in which - # we have the correspondence tables A:C1, Ci:C(i+1) for i = 1, ..., - # (k-1) Ci and B:Ck. - M <- list() - if (length(R) >= 4) { - message("Percentage of codes of ", colnames(RRR[[1]][1]), " processed:") - pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") - - - # The following for loop creates the desirable correspondence - # table. The operations are conducted for each unique element of - # classification A of the correspondence table A:C1. 
- for (i in unique(R[[1]][, 1])) { - - counter <- counter + 1 - setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) - - - for (j in 1:(length(R) - 2)) { - # The same operations as in the case that we have only the - # correspondence tables A:C1 and B:C1, but here for the - # correspondence tables C1:C2 and C2:C3. - if (j == 1) { - - x1 <- R[[j]][which(R[[j]][, 1] == i), 2] - T <- matrix(R[[j + 1]][!is.na(match(R[[j + 1]][, 1], x1)), 1:2], - ncol = 2) - - # The records of A:C1 that do not exist in C1:C2 (in terms - # of the values of classification C1) - - if (length(which(is.na(match(x1, T[, 1])) == TRUE)) > 0) { - M1 <- matrix(matrix(R[[j]][which(R[[j]][, 1] == i), 1:2], ncol = 2)[is.na(match(x1, - T[, 1])), ], ncol = 2) - } else { - M1 = matrix(0, 1, 2 * length(R)) - M1 = M1[FALSE, ] - } - - if (nrow(M1) != 0) { - for (times in 1:(2 * length(R) - ncol(M1))) { - - M1 <- cbind(M1, "") - - } - } - - x2 <- R[[j + 1]][!is.na(match(R[[j + 1]][, 1], x1)), 2] - T1 <- matrix(R[[j + 2]][!is.na(match(R[[j + 2]][, 1], x2)), 1:2], - ncol = 2) - - if (length(which(is.na(match(x2, T1[, 1])) == TRUE)) > 0) { - - if (length(which(is.na(match(x2, T1[, 1])) == TRUE)) == 1) { - M2 <- matrix(c(i, T[is.na(match(x2, T1[, 1])), 1], T[is.na(match(x2, - T1[, 1])), ]), ncol = 4) - - } else { - M2 <- cbind(i, T[is.na(match(x2, T1[, 1])), 1], T[is.na(match(x2, - T1[, 1])), ]) - } - } else { - M2 = matrix(0, 1, 2 * length(R)) - M2 = M2[FALSE, ] - } - - if (nrow(M2) != 0) { - for (times in 1:(2 * length(R) - ncol(M2))) { - - M2 <- cbind(M2, "") - - } - } - - t <- match(T[, 2], T[, 2]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - T[order(t)[x], , drop = FALSE] - }) - - t1 <- match(T1[, 1], T[, 2]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - T1[order(t1)[x], , drop = FALSE] - }) - - Z <- Z[!is.na(match(lapply(Z, 
function(x) { - unique(x[, 2]) - }), lapply(Z1, function(x) { - unique(x[, 1]) - })))] - - a <- lapply(Z, function(x) { - 1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = 2) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - ZZ <- do.call(rbind, Map(cbind, aa, aa1)) - - } - - # The same operations as in the case that we have only the - # correspondence tables A:C1 and B:C1, but here for the pairs - # of correspondence tables (C2:C3 - C3:C4), (C3:C4 - C4:C5), - # ..., (C(k-2):C(k-1) - C(k-1):Ck). For each value of j that - # satisfies the if statement, the previous matrix ZZ created - # is used. For j = 2, the matrix ZZ created in the previous - # if statement is used. - if (j >= 2 && j <= (length(R) - 3) && length(R) != 4) { - - t <- match(ZZ[, ncol(ZZ)], ZZ[, ncol(ZZ)]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - ZZ[order(t)[x], , drop = FALSE] - }) - - t1 <- match(R[[j + 2]][, 1], ZZ[, ncol(ZZ)]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - R[[j + 2]][order(t1)[x], 1:2, drop = FALSE] - }) - - if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])) == - TRUE)) > 0) { - if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])) == - TRUE)) == 1) { - M3 <- matrix(c(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, - 1])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])), - ]), ncol = ncol(ZZ) + 2) - } else { - M3 <- cbind(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, - 1])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])), - ]) - } - } else { - M3 = matrix(0, 1, 2 * length(R)) - M3 = M3[FALSE, ] - } - - if (nrow(M3) != 0) { - for (times in 1:(2 * length(R) - ncol(M3))) { - - M3 <- 
cbind(M3, "") - - } - } - - M[[j - 1]] <- M3 - - Z <- Z[!is.na(match(lapply(Z, function(x) { - unique(x[, ncol(ZZ)]) - }), lapply(Z1, function(x) { - unique(x[, 1]) - })))] - - a <- lapply(Z, function(x) { - 1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = ncol(ZZ)) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - - ZZ <- do.call(rbind, Map(cbind, aa, aa1)) - - } - - # The same operations as in the case that we have only the - # correspondence tables A:C1 and B:C1, but here for the - # correspondence tables C(k-1):Ck and B:Ck. For the value of - # j that satisfies the if statement, the matrix ZZ created in - # the previous if statement is used. - if (j == (length(R) - 2)) { - - t <- match(ZZ[, ncol(ZZ)], ZZ[, ncol(ZZ)]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - ZZ[order(t)[x], , drop = FALSE] - }) - - t1 <- match(R[[length(R)]][, 2], ZZ[, ncol(ZZ)]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - R[[length(R)]][order(t1)[x], 1:2, drop = FALSE] - }) - - if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])) == TRUE)) > 0) { - if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])) == TRUE)) == 1) { - M4 <- matrix(c(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])), ]), ncol = ncol(ZZ) + 2) - } else { - M4 <- cbind(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])), ]) - } - } else { - M4 = matrix(0, 1, 2 * length(R)) - M4 = M4[FALSE, ] - } - - - if (nrow(M4) != 0) { - for (times in 1:(2 * length(R) - ncol(M4))) { - - M4 <- cbind(M4, "") - - } - 
} - - Z <- Z[!is.na(match(lapply(Z, function(x) { - unique(x[, ncol(ZZ)]) - }), lapply(Z1, function(x) { - unique(x[, 2]) - })))] - - a <- lapply(Z, function(x) { - 1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = ncol(ZZ)) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - - ZZ <- do.call(rbind, Map(cbind, aa, aa1)) - - } - } - - if (is.null(dim(ZZ))) { - F_AtoB[[counter]] <- rbind(M1, M2, do.call(rbind, M), M4) - } else { - F_AtoB[[counter]] <- rbind(cbind(i, ZZ[, 1], ZZ), M1, M2, do.call(rbind, - M), M4) - } - } - } - - # Create the desired correspondence table for the selected element of - # classification A. - F_AtoB <- do.call(rbind, F_AtoB) - - # Keep in F the classifications A, C1, C2, ..., Ck, B once, based on - # the number of the correspondence tables. - if (length(R) == 2) { - F_AtoB <- F_AtoB[, c(1, 2, 3)] - } - if (length(R) == 3) { - F_AtoB <- F_AtoB[, c(1, 2, 4, 5)] - } - if (length(R) >= 4) { - F_AtoB <- F_AtoB[, sort(c(1, seq(2, 2 * length(R) - 2, 2), 2 * length(R) - - 1))] - } - - # Convert classifications as well as correspondence tables so as to - # move from classification B to classification A. Until the next - # comment, all the lines are the same as in the case that we move from - # classification A to classification B. 
- RRR_BtoA <- RRR[c(rev(1:(k + 2)), rev(utils::tail(c(1:length(RRR)), (length(RRR) - - 1)/2)))] - if (length(rev(utils::tail(c(1:length(RR)), (length(RR) - 1)/2))) >= 3) { - for (rev in (k + 4):(length(RRR_BtoA) - 1)) { - column_2 <- RRR_BtoA[[rev]][, 2] - RRR_BtoA[[rev]][, 2] <- RRR_BtoA[[rev]][, 1] - RRR_BtoA[[rev]][, 1] <- column_2 - } - } - - RR <- lapply(RRR_BtoA, function(x) { - matrix(unlist(x), ncol = ncol(x)) - }) - - R <- RR[utils::tail(c(1:length(RR)), (length(RR) - 1)/2)] - - F_BtoA <- list() - - counter <- 0 - message("\n") - if (length(R) == 2) { - - message("Percentage of codes of ", colnames(RRR_BtoA[[1]][1]), " processed:") - pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") - - for (i in unique(R[[1]][, 1])) { - - counter <- counter + 1 - setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) - - x1 <- R[[1]][which(R[[1]][, 1] == i), 2] - TT <- matrix(R[[1]][which(R[[1]][, 1] == i), 1:2], ncol = 2) - T <- matrix(R[[2]][!is.na(match(R[[2]][, 2], x1)), 1:2], ncol = 2) - - t <- match(T[, 2], T[, 2]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - T[order(t)[x], , drop = FALSE] - }) - - t1 <- match(TT[, 2], T[, 2]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - TT[order(t1)[x], , drop = FALSE] - }) - - Z <- Z[!is.na(match(lapply(Z, function(x) { - unique(x[, 2]) - }), lapply(Z1, function(x) { - unique(x[, 2]) - })))] - - a <- lapply(Z, function(x) { - 1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = 2) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - ZZ <- do.call(rbind, Map(cbind, aa1, aa)) - - t1 <- matrix(TT[is.na(match(TT[, 2], ZZ[, 
2])), ], ncol = 2) - ZZ <- rbind(ZZ, cbind(t1, matrix("", nrow = nrow(t1), ncol = 2))) - - F_BtoA[[counter]] <- ZZ - - } - } - - if (length(R) == 3) { - message("Percentage of codes of ", colnames(RRR_BtoA[[1]][1]), " processed:") - pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") - for (i in unique(R[[1]][, 1])) { - - counter <- counter + 1 - setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) - - x1 <- R[[1]][which(R[[1]][, 1] == i), 2] - T <- matrix(R[[2]][!is.na(match(R[[2]][, 1], x1)), 1:2], ncol = 2) - - if (length(which(is.na(match(x1, T[, 1])) == TRUE)) > 0) { - M1 <- matrix(matrix(R[[1]][which(R[[1]][, 1] == i), 1:2], ncol = 2)[is.na(match(x1, - T[, 1])), ], ncol = 2) - } else { - M1 = matrix(0, 1, 2 * length(R)) - M1 = M1[FALSE, ] - } - - if (nrow(M1) != 0) { - for (times in 1:(2 * length(R) - ncol(M1))) { - - M1 <- cbind(M1, "") - - } - } - - x2 <- R[[2]][!is.na(match(R[[2]][, 1], x1)), 2] - T1 <- matrix(R[[3]][!is.na(match(R[[3]][, 2], x2)), 1:2], ncol = 2) - - if (length(which(is.na(match(x2, T1[, 2])) == TRUE)) > 0) { - if (length(which(is.na(match(x2, T1[, 2])) == TRUE)) == 1) { - M2 <- matrix(c(i, T[is.na(match(x2, T1[, 2])), 1], T[is.na(match(x2, - T1[, 2])), ]), ncol = 4) - } else { - M2 <- cbind(i, T[is.na(match(x2, T1[, 2])), 1], T[is.na(match(x2, - T1[, 2])), ]) - } - } else { - M2 = matrix(0, 1, 2 * length(R)) - M2 = M2[FALSE, ] - } - - if (nrow(M2) != 0) { - for (times in 1:(2 * length(R) - ncol(M2))) { - - M2 <- cbind(M2, "") - - } - } - - t <- match(T[, 2], T[, 2]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - T[order(t)[x], , drop = FALSE] - }) - - t1 <- match(T1[, 2], T[, 2]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - T1[order(t1)[x], , drop = FALSE] - }) - - Z <- Z[!is.na(match(lapply(Z, function(x) { - unique(x[, 2]) - }), 
lapply(Z1, function(x) { - unique(x[, 2]) - })))] - - a <- lapply(Z, function(x) { - 1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = 2) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - ZZ <- do.call(rbind, Map(cbind, aa, aa1)) - - if (is.null(dim(ZZ))) { - F_BtoA[[counter]] <- rbind(M1, M2) - } else { - F_BtoA[[counter]] <- rbind(cbind(i, ZZ[, 1], ZZ), M1, M2) - } - - } - - } - M <- list() - if (length(R) >= 4) { - message("Percentage of codes of ", colnames(RRR_BtoA[[1]][1]), " processed:") - pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") - for (i in unique(R[[1]][, 1])) { - - counter <- counter + 1 - setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) - - for (j in 1:(length(R) - 2)) { - if (j == 1) { - - x1 <- R[[j]][which(R[[j]][, 1] == i), 2] - T <- matrix(R[[j + 1]][!is.na(match(R[[j + 1]][, 1], x1)), 1:2], - ncol = 2) - - if (length(which(is.na(match(x1, T[, 1])) == TRUE)) > 0) { - M1 <- matrix(matrix(R[[j]][which(R[[j]][, 1] == i), 1:2], ncol = 2)[is.na(match(x1, - T[, 1])), ], ncol = 2) - } else { - M1 = matrix(0, 1, 2 * length(R)) - M1 = M1[FALSE, ] - } - - if (nrow(M1) != 0) { - for (times in 1:(2 * length(R) - ncol(M1))) { - - M1 <- cbind(M1, "") - - } - } - - x2 <- R[[j + 1]][!is.na(match(R[[j + 1]][, 1], x1)), 2] - T1 <- matrix(R[[j + 2]][!is.na(match(R[[j + 2]][, 1], x2)), 1:2], - ncol = 2) - - if (length(which(is.na(match(x2, T1[, 1])) == TRUE)) > 0) { - - if (length(which(is.na(match(x2, T1[, 1])) == TRUE)) == 1) { - M2 <- matrix(c(i, T[is.na(match(x2, T1[, 1])), 1], T[is.na(match(x2, - T1[, 1])), ]), ncol = 4) - - } else { - M2 <- cbind(i, T[is.na(match(x2, T1[, 1])), 1], T[is.na(match(x2, - T1[, 1])), ]) - } - } else { - M2 = matrix(0, 1, 2 * length(R)) - M2 = M2[FALSE, ] 
- } - - if (nrow(M2) != 0) { - for (times in 1:(2 * length(R) - ncol(M2))) { - - M2 <- cbind(M2, "") - - } - } - - t <- match(T[, 2], T[, 2]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - T[order(t)[x], , drop = FALSE] - }) - - t1 <- match(T1[, 1], T[, 2]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - T1[order(t1)[x], , drop = FALSE] - }) - - Z <- Z[!is.na(match(lapply(Z, function(x) { - unique(x[, 2]) - }), lapply(Z1, function(x) { - unique(x[, 1]) - })))] - - a <- lapply(Z, function(x) { - 1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = 2) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - ZZ <- do.call(rbind, Map(cbind, aa, aa1)) - - } - - if (j >= 2 && j <= (length(R) - 3) && length(R) != 4) { - - t <- match(ZZ[, ncol(ZZ)], ZZ[, ncol(ZZ)]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - ZZ[order(t)[x], , drop = FALSE] - }) - - t1 <- match(R[[j + 2]][, 1], ZZ[, ncol(ZZ)]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - R[[j + 2]][order(t1)[x], 1:2, drop = FALSE] - }) - - if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])) == - TRUE)) > 0) { - if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])) == - TRUE)) == 1) { - M3 <- matrix(c(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, - 1])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])), - ]), ncol = ncol(ZZ) + 2) - } else { - M3 <- cbind(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, - 1])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])), - ]) - } - } else { - M3 = matrix(0, 1, 2 * 
length(R)) - M3 = M3[FALSE, ] - } - - if (nrow(M3) != 0) { - for (times in 1:(2 * length(R) - ncol(M3))) { - - M3 <- cbind(M3, "") - - } - } - M[[j - 1]] <- M3 - - Z <- Z[!is.na(match(lapply(Z, function(x) { - unique(x[, ncol(ZZ)]) - }), lapply(Z1, function(x) { - unique(x[, 1]) - })))] - - a <- lapply(Z, function(x) { - 1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = ncol(ZZ)) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - - ZZ <- do.call(rbind, Map(cbind, aa, aa1)) - - } - - if (j == (length(R) - 2)) { - - t <- match(ZZ[, ncol(ZZ)], ZZ[, ncol(ZZ)]) - v1 <- sequence(rle(sort(t))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z <- lapply(v1, function(x) { - ZZ[order(t)[x], , drop = FALSE] - }) - - t1 <- match(R[[length(R)]][, 2], ZZ[, ncol(ZZ)]) - v1 <- sequence(rle(sort(t1))$lengths) - v1 <- split(seq_along(v1), cumsum(v1 == 1)) - Z1 <- lapply(v1, function(x) { - R[[length(R)]][order(t1)[x], 1:2, drop = FALSE] - }) - - if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])) == TRUE)) > 0) { - if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])) == TRUE)) == 1) { - M4 <- matrix(c(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])), ]), ncol = ncol(ZZ) + 2) - } else { - M4 <- cbind(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, - 2])), ]) - } - } else { - M4 = matrix(0, 1, 2 * length(R)) - M4 = M4[FALSE, ] - } - - - if (nrow(M4) != 0) { - for (times in 1:(2 * length(R) - ncol(M4))) { - - M4 <- cbind(M4, "") - - } - } - - Z <- Z[!is.na(match(lapply(Z, function(x) { - unique(x[, ncol(ZZ)]) - }), lapply(Z1, function(x) { - unique(x[, 2]) - })))] - - a <- lapply(Z, function(x) { - 
1:nrow(x) - }) - a1 <- lapply(Z1, function(x) { - 1:nrow(x) - }) - - aa <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z, Map(expand.grid, a, a1)), function(x) { - matrix(x, ncol = ncol(ZZ)) - }) - aa1 <- lapply(Map(function(x, y) { - x[y[, 1], ] - }, Z1, Map(expand.grid, a1, a)), function(x) { - matrix(x, ncol = 2) - }) - - ZZ <- do.call(rbind, Map(cbind, aa, aa1)) - - } - } - - if (is.null(dim(ZZ))) { - F_BtoA[[counter]] <- rbind(M1, M2, do.call(rbind, M), M4) - } else { - F_BtoA[[counter]] <- rbind(cbind(i, ZZ[, 1], ZZ), M1, M2, do.call(rbind, - M), M4) - } - - - } - } - - F_BtoA <- do.call(rbind, F_BtoA) - - if (length(R) == 2) { - F_BtoA <- F_BtoA[, c(1, 2, 3)] - } - if (length(R) == 3) { - F_BtoA <- F_BtoA[, c(1, 2, 4, 5)] - } - if (length(R) >= 4) { - F_BtoA <- F_BtoA[, sort(c(1, seq(2, 2 * length(R) - 2, 2), 2 * length(R) - - 1))] - } - - - F_BtoA <- F_BtoA[, rev(1:ncol(F_BtoA))] - # Combine the results from moving from classification A to B, and vice - # versa. F_AtoB - keep <- 0 - keepF_AtoB <- c(0) - for (iterr in 1:nrow(F_AtoB)) { - - if (F_AtoB[iterr, 1] != "") { - blanks <- F_AtoB[iterr, ] == "" - - if (all(blanks == FALSE)) { - keep <- keep + 1 - keepF_AtoB[keep] <- iterr - } else { - blanks = which(F_AtoB[iterr, ] == "") - if (all(c(blanks[1]:ncol(F_AtoB)) == "")) { - keep <- keep + 1 - keepF_AtoB[keep] <- iterr - } - } - - } - } - - NoNullF_AtoB <- matrix(F_AtoB[keepF_AtoB, ], ncol = k + 2) - if (nrow(NoNullF_AtoB) != nrow(F_AtoB)) { - if (length(keepF_AtoB) == 1 && keepF_AtoB == c(0)) { - FNullAtoB <- matrix(F_AtoB, ncol = k + 2) - for (iter in 1:nrow(FNullAtoB)) { - FNullAtoB[iter, (which(FNullAtoB[iter, ] == "")[1]):(k + 2)] <- "" - } - } else { - FNullAtoB <- matrix(F_AtoB[-keepF_AtoB, ], ncol = k + 2) - for (iter in 1:nrow(FNullAtoB)) { - FNullAtoB[iter, (which(FNullAtoB[iter, ] == "")[1]):(k + 2)] <- "" - } - } - } else { - FNullAtoB <- matrix(0, 1, k + 2) - FNullAtoB <- FNullAtoB[FALSE, ] - } - - # F_BtoA - keep <- 0 - keepF_BtoA <- c(0) 
- for (iterr in 1:nrow(F_BtoA)) { - - if (F_BtoA[iterr, ncol(F_AtoB)] != "") { - blanks <- F_BtoA[iterr, ] == "" - - if (all(blanks == FALSE)) { - keep <- keep + 1 - keepF_BtoA[keep] <- iterr - } else { - blanks <- which(F_BtoA[iterr, ] == "") - if (all(c(1:length(blanks)) == "")) { - keep <- keep + 1 - keepF_BtoA[keep] <- iterr - } - } - - } - } - - # Combine all together - - NoNullF_BtoA <- matrix(F_BtoA[keepF_BtoA, ], ncol = k + 2) - if (nrow(NoNullF_BtoA) != nrow(F_BtoA)) { - if (length(keepF_BtoA) == 1 && keepF_BtoA == c(0)) { - FNullBtoA <- matrix(F_BtoA, ncol = k + 2) - for (iter in 1:nrow(FNullBtoA)) { - FNullBtoA[iter, (which(FNullBtoA[iter, ] == "")[length(which(FNullBtoA[iter, - ] == ""))]):1] <- "" - } - } else { - FNullBtoA <- matrix(F_BtoA[-keepF_BtoA, ], ncol = k + 2) - for (iter in 1:nrow(FNullBtoA)) { - FNullBtoA[iter, (which(FNullBtoA[iter, ] == "")[length(which(FNullBtoA[iter, - ] == ""))]):1] <- "" - } - } - } else { - FNullBtoA <- matrix(0, 1, k + 2) - FNullBtoA <- FNullBtoA[FALSE, ] - } - - F <- unique(rbind(NoNullF_AtoB, NoNullF_BtoA)) - F <- unique(rbind(F, unique(FNullAtoB), unique(FNullBtoA))) - if (length(which(apply(F, 1, function(x) { - length(which(x == "")) - } == k + 2) == TRUE)) >= 1) { - F <- F[-which(apply(F, 1, function(x) { - length(which(x == "")) - } == k + 2) == TRUE), ] - } - - # The if statement is based on which of classifications A or B is the - # reference one (if any). - if (length(which(apply(F, 1, function(x) { - length(which(x == "")) - }) == 0)) >= 1) { - - if (Reference == "A") { - idx <- k + 5 - - # Creation of the review flag for the correspondence table A:B. 
- F1 <- matrix(F[apply(F, 1, function(x) { - length(which(x == "")) - }) == 0, ], ncol = k + 2) - F2 <- F[apply(F, 1, function(x) { - length(which(x == "")) - }) >= 1, ] - F2 <- matrix(unlist(F2), ncol = k + 2) - f <- stats::aggregate(matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, 2], - list(num = matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, 2]), - length)[which(stats::aggregate(matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, - 2], list(num = matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, - 2]), length)[, 2] > 1), 1] - reviewF1 <- rep(0, nrow(F1)) - reviewF1[which(F1[, ncol(F1)] %in% f)] <- 1 - Review <- data.frame(cbind(rbind(F1, F2), c(reviewF1, rep(0, nrow(F2))))) - - # Creation of the redundancy flag for the correspondence table - # A:B. - F1 <- Review[apply(Review, 1, function(x) { - length(which(x == "")) - }) == 0, ] - F1 <- matrix(unlist(F1), ncol = k + 3) - F1 <- data.frame(F1) - colnames(F1) <- colnames(Review) - F2 <- Review[apply(Review, 1, function(x) { - length(which(x == "")) - }) >= 1, ] - F2 <- matrix(unlist(F2), ncol = k + 3) - F2 <- data.frame(F2) - colnames(F2) <- colnames(F1) - f1 <- stats::aggregate(F1[, c(1, ncol(F1) - 1)], by = F1[, c(1, ncol(F1) - - 1)], length)[1:(ncol(F1[, c(1, ncol(F1) - 1)]) + 1)][which(stats::aggregate(F1[, - c(1, ncol(F1) - 1)], by = F1[, c(1, ncol(F1) - 1)], length)[1:(ncol(F1[, - c(1, ncol(F1) - 1)]) + 1)][, 3] >= 2), 1:2] - redundancyF1 <- rep(0, nrow(F1)) - redundancyF1[which(apply(F1[, c(1, ncol(F1) - 1)], 1, paste, collapse = "") %in% - apply(f1, 1, paste, collapse = ""))] <- 1 - correspondenceAB <- data.frame(cbind(rbind(F1, F2), c(redundancyF1, - rep(0, nrow(F2))))) - - # Creation of the unmatched flag for the correspondence table - # A:B. 
- correspondenceAB <- data.frame(correspondenceAB, 1) - colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, - function(x) { - colnames(x)[1] - }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Review", "Redundancy", - "Unmatched") - - } else if (Reference == "B") { - idx <- k + 5 - - # Creation of the review flag for the correspondence table A:B. - F1 <- matrix(F[apply(F, 1, function(x) { - length(which(x == "")) - }) == 0, ], ncol = k + 2) - F2 <- F[apply(F, 1, function(x) { - length(which(x == "")) - }) >= 1, ] - F2 <- matrix(unlist(F2), ncol = k + 2) - f <- stats::aggregate(matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, 1], - list(num = matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, 1]), - length)[which(stats::aggregate(matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, - 1], list(num = matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, - 1]), length)[, 2] > 1), 1] - reviewF1 <- rep(0, nrow(F1)) - reviewF1[which(F1[, 1] %in% f)] <- 1 - Review <- data.frame(cbind(rbind(F1, F2), c(reviewF1, rep(0, nrow(F2))))) - - # Creation of the redundancy flag for the correspondence table - # A:B. 
- F1 <- Review[apply(Review, 1, function(x) { - length(which(x == "")) - }) == 0, ] - F1 <- matrix(unlist(F1), ncol = k + 3) - F1 <- data.frame(F1) - colnames(F1) <- colnames(Review) - F2 <- Review[apply(Review, 1, function(x) { - length(which(x == "")) - }) >= 1, ] - F2 <- matrix(unlist(F2), ncol = k + 3) - F2 <- data.frame(F2) - colnames(F2) <- colnames(F1) - f1 <- stats::aggregate(F1[, c(1, ncol(F1) - 1)], by = F1[, c(1, ncol(F1) - - 1)], length)[1:(ncol(F1[, c(1, ncol(F1) - 1)]) + 1)][which(stats::aggregate(F1[, - c(1, ncol(F1) - 1)], by = F1[, c(1, ncol(F1) - 1)], length)[1:(ncol(F1[, - c(1, ncol(F1) - 1)]) + 1)][, 3] >= 2), 1:2] - redundancyF1 <- rep(0, nrow(F1)) - redundancyF1[which(apply(F1[, c(1, ncol(F1) - 1)], 1, paste, collapse = "") %in% - apply(f1, 1, paste, collapse = ""))] <- 1 - correspondenceAB <- data.frame(cbind(rbind(F1, F2), c(redundancyF1, - rep(0, nrow(F2))))) - - # Creation of the unmatched flag for the correspondence table - # A:B. - correspondenceAB <- data.frame(correspondenceAB, 1) - colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, - function(x) { - colnames(x)[1] - }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Review", "Redundancy", - "Unmatched") - - } else if (Reference == "none") { - idx <- k + 4 - - # Creation of the redundancy flag for the correspondence table - # A:B. 
- F1 <- data.frame(F[apply(F, 1, function(x) { - length(which(x == "")) - }) == 0, ]) - F1 <- matrix(unlist(F1), ncol = k + 2) - F1 <- data.frame(F1) - F2 <- data.frame(F[apply(F, 1, function(x) { - length(which(x == "")) - }) >= 1, ]) - F2 <- matrix(unlist(F2), ncol = k + 2) - F2 <- data.frame(F2) - colnames(F2) <- colnames(F1) - f1 <- stats::aggregate(F1[, c(1, ncol(F1))], by = F1[, c(1, ncol(F1))], - length)[1:(ncol(F1[, c(1, ncol(F1))]) + 1)][which(stats::aggregate(F1[, - c(1, ncol(F1))], by = F1[, c(1, ncol(F1))], length)[1:(ncol(F1[, - c(1, ncol(F1))]) + 1)][, 3] >= 2), 1:2] - redundancyF1 <- rep(0, nrow(F1)) - redundancyF1[which(apply(F1[, c(1, ncol(F1))], 1, paste, collapse = "") %in% - apply(f1, 1, paste, collapse = ""))] <- 1 - correspondenceAB <- data.frame(cbind(rbind(F1, F2), c(redundancyF1, - rep(0, nrow(F2))))) - - # Creation of the unmatched flag for the correspondence table - # A:B. - correspondenceAB <- data.frame(correspondenceAB, 1) - colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, - function(x) { - colnames(x)[1] - }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Redundancy", - "Unmatched") - - } - } else { - if (Reference %in% c("A", "B")) { - Review <- rep(0, nrow(F)) - Redundancy <- rep(0, nrow(F)) - Unmatched <- rep(1, nrow(F)) - correspondenceAB <- data.frame(cbind(F, Review, Redundancy, Unmatched)) - colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, - function(x) { - colnames(x)[1] - }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Review", "Redundancy", - "Unmatched") - } - if (Reference == "none") { - Redundancy <- rep(0, nrow(F)) - Unmatched <- rep(1, nrow(F)) - correspondenceAB <- data.frame(cbind(F, Redundancy, Unmatched)) - colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, - function(x) { - colnames(x)[1] - }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Redundancy", - "Unmatched") - } - } - - - # The final 
Unmatched and the NoMatchFrom flags are created - NoMatchFromA <- rep("", nrow(correspondenceAB)) - NoMatchFromB <- rep("", nrow(correspondenceAB)) - correspondenceAB <- cbind(correspondenceAB, NoMatchFromA, NoMatchFromB) - - inA <- which(is.na(match(unlist(RRR[[1]][, 1]), correspondenceAB[, 1])) == TRUE) - if (length(inA) >= 1) { - InA <- cbind(matrix(RRR[[1]][inA, 1], length(inA), 1), matrix("", length(inA), - idx - 1)) - InA <- cbind(InA, matrix("", length(inA), 2)) - InA <- data.frame(InA) - colnames(InA) <- colnames(correspondenceAB) - correspondenceAB <- rbind(correspondenceAB, InA) - } - - inB <- which(is.na(match(unlist(RRR[[nrow(x)]][, 1]), correspondenceAB[, k + 2])) == - TRUE) - if (length(inB) >= 1) { - InB <- cbind(matrix("", length(inB), k + 1), matrix(RRR[[nrow(x)]][inB, - 1], length(inB), 1), matrix("", length(inB), idx - k - 2)) - InB <- cbind(InB, matrix("", length(inB), 2)) - InB <- data.frame(InB) - colnames(InB) <- colnames(correspondenceAB) - correspondenceAB <- rbind(correspondenceAB, InB) - } - - yesA <- which(!is.na(match(correspondenceAB[, 1], unlist(RRR[[1]][, 1]))) == TRUE) - yesAC1 <- which(!is.na(match(correspondenceAB[, 1], unlist(RRR[[nrow(x) + 1]][, - 1]))) == TRUE) - noAC1 <- which(is.na(match(correspondenceAB[, 1], unlist(RRR[[nrow(x) + 1]][, 1]))) == - TRUE) - - correspondenceAB$NoMatchFromA[intersect(yesA, yesAC1)] <- 0 - correspondenceAB$NoMatchFromA[intersect(yesA, noAC1)] <- 1 - - yesB <- which(!is.na(match(correspondenceAB[, k + 2], unlist(RRR[[nrow(x)]][, 1]))) == - TRUE) - yesBCk <- which(!is.na(match(correspondenceAB[, k + 2], unlist(RRR[[length(RRR)]][, - 1]))) == TRUE) - noBCk <- which(is.na(match(correspondenceAB[, k + 2], unlist(RRR[[length(RRR)]][, - 1]))) == TRUE) - - correspondenceAB$NoMatchFromB[intersect(yesB, yesBCk)] <- 0 - correspondenceAB$NoMatchFromB[intersect(yesB, noBCk)] <- 1 - - yesFinalA <- which(correspondenceAB[, 1] != "") - yesFinalB <- which(correspondenceAB[, k + 2] != "") - 
correspondenceAB$Unmatched <- 1 - correspondenceAB$Unmatched[intersect(yesFinalA, yesFinalB)] <- 0 - - if ((Reference %in% c("A", "B"))) { - correspondenceAB$Review[which(correspondenceAB[, 1] == "")] <- "" - correspondenceAB$Review[which(correspondenceAB[, k + 2] == "")] <- "" - } - - # Final redundancy flag - correspondenceAB$Redundancy <- 0 - f1 <- stats::aggregate(correspondenceAB[, c(1, k + 2)], by = correspondenceAB[, - c(1, k + 2)], length)[1:(ncol(correspondenceAB[, c(1, k + 2)]) + 1)][which(stats::aggregate(correspondenceAB[, - c(1, k + 2)], by = correspondenceAB[, c(1, k + 2)], length)[1:(ncol(correspondenceAB[, - c(1, k + 2)]) + 1)][, 3] >= 2), 1:2] - correspondenceAB$Redundancy[which(apply(correspondenceAB[, c(1, k + 2)], - 1, paste, collapse = " ") %in% apply(f1, 1, paste, collapse = " "))] <- 1 - - }, error = function(e) { - stop(simpleError(paste("An error has occurred and execution needs to stop. Please check the input data. \n Details line 1734:\n",e))) - }) - - message("\n") - - # Check the number of the unmatched codes. - if (length(which(as.vector(correspondenceAB$Unmatched) == 1))/nrow(correspondenceAB) < - MismatchTolerance) { - - tryCatch({ - - # The following if statement is applied if there are any - # supplementary information for the classification A, in order to - # be adjusted next to the correspondence table A:B. - if (ncol(RRR[[1]]) >= 2) { - A1 <- RRR[[1]][match(correspondenceAB[, 1], unlist(RRR[[1]][, 1])), - 2:ncol(RRR[[1]])] - A1[is.na(A1)] <- "" - A1 <- matrix(unlist(A1), ncol = length(2:ncol(RRR[[1]]))) - colnames(A1) <- paste(paste(colnames(RRR[[1]])[1]), colnames(RRR[[1]])[2:ncol(RRR[[1]])], - sep = "_") - correspondenceAB <- cbind(correspondenceAB, A1) - } - - # The following for loop is applied for the classfications C1, C2, - # ..., Ck. 
- for (i1 in c(2:(((length(RRR) + 1)/2) - 1))) { - - # The if statement is applied if there are any supplementary - # information for the classfications C1, C2, ..., Ck, in order - # to be adjusted next to the correspondence table A:B. - if (ncol(RRR[[i1]]) >= 2) { - A1 <- RRR[[i1]][match(correspondenceAB[, i1], unlist(RRR[[i1]][, - 1])), 2:ncol(RRR[[i1]])] - A1[is.na(A1)] <- "" - A1 <- matrix(unlist(A1), ncol = length(2:ncol(RRR[[i1]]))) - colnames(A1) <- paste(paste(colnames(RRR[[i1]])[1]), colnames(RRR[[i1]])[2:ncol(RRR[[i1]])], - sep = "_") - correspondenceAB <- cbind(correspondenceAB, A1) - } - - } - - # The following if statement is applied if there are any - # supplementary information for the classification B, in order to - # be adjusted next to the correspondence table A:B. - if (ncol(RRR[[(length(RRR) + 1)/2]]) >= 2) { - A1 <- RRR[[(length(RRR) + 1)/2]][match(correspondenceAB[, (length(RRR) + - 1)/2], unlist(RRR[[(length(RRR) + 1)/2]][, 1])), 2:ncol(RRR[[(length(RRR) + - 1)/2]])] - A1[is.na(A1)] <- "" - A1 <- matrix(unlist(A1), ncol = length(2:ncol(RRR[[(length(RRR) + - 1)/2]]))) - colnames(A1) <- paste(paste(colnames(RRR[[k + 2]])[1]), colnames(RRR[[(length(RRR) + - 1)/2]])[2:ncol(RRR[[(length(RRR) + 1)/2]])], sep = "_") - correspondenceAB <- cbind(correspondenceAB, A1) - } - - # Find which .csv files are the correspondence tables. - Tail <- utils::tail(c(1:length(RRR)), (length(RRR) - 1)/2) - - # The following if statement is applied if there are any - # supplementary information for the correspondence table A:C1, in - # order to be adjusted next to the correspondence table A:B. 
- if (ncol(RRR[[Tail[1]]]) >= 3) { - A1 <- RRR[[Tail[1]]][match(data.frame(t(correspondenceAB[, 1:2])), - data.frame(t(RRR[[Tail[1]]][, 1:2]))), 3:ncol(RRR[[Tail[1]]])] - A1[is.na(A1)] <- "" - A1 <- matrix(unlist(A1), ncol = length(3:ncol(RRR[[Tail[1]]]))) - colnames(A1) <- paste(paste(colnames(RRR[[Tail[1]]])[1]), colnames(RRR[[Tail[1]]])[3:ncol(RRR[[Tail[1]]])], - sep = "_") - correspondenceAB <- cbind(correspondenceAB, A1) - } - - # The following if statement is applied if there are any - # supplementary information for the correspondence tables (C1:C2 - - # C2:C3), (C2:C3 - C3:C4), ..., (C(k-2):C(k-1) - C(k-1):Ck), in - # order to be adjusted next to the correspondence table A:B. - if (length(Tail) >= 3) { - for (i2 in 2:(length(Tail) - 1)) { - if (ncol(RRR[[Tail[i2]]]) >= 3) { - A1 <- RRR[[Tail[i2]]][match(data.frame(t(correspondenceAB[, c(i2, - i2 + 1)])), data.frame(t(RRR[[Tail[i2]]][, 1:2]))), 3:ncol(RRR[[Tail[i2]]])] - A1[is.na(A1)] <- "" - A1 <- matrix(unlist(A1), ncol = length(3:ncol(RRR[[Tail[i2]]]))) - colnames(A1) <- paste(paste(colnames(RRR[[Tail[i2]]])[1]), colnames(RRR[[Tail[i2]]])[3:ncol(RRR[[Tail[i2]]])], - sep = "_") - correspondenceAB <- cbind(correspondenceAB, A1) - } - } - } - - # The following if statement is applied if there are any - # supplementary information for the correspondence table B:Ck, in - # order to be adjusted next to the correspondence table A:B. 
- if (ncol(RRR[[Tail[length(Tail)]]]) >= 3) { - A1 <- RRR[[Tail[length(Tail)]]][match(data.frame(t(correspondenceAB[, - c(((length(RRR) + 1)/2) - 1, (length(RRR) + 1)/2)])), data.frame(t(RRR[[Tail[length(Tail)]]][, - c(2, 1)]))), 3:ncol(RRR[[Tail[length(Tail)]]])] - A1[is.na(A1)] <- "" - A1 <- matrix(unlist(A1), ncol = length(3:ncol(RRR[[Tail[length(Tail)]]]))) - colnames(A1) <- paste(paste(colnames(RRR[[Tail[length(Tail)]]])[1]), - colnames(RRR[[Tail[length(Tail)]]])[3:ncol(RRR[[Tail[length(Tail)]]])], - sep = "_") - correspondenceAB <- cbind(correspondenceAB, A1) - } - }, error = function(e) { - stop(simpleError(paste("An error has occurred and execution needs to stop. Please check the input data. \n Details line 1841: \n",e))) - }) - - } else { - # Error message in case the percentage of unmatched codes between A and - # B is larger than the desired threshold. - stop("Too many codes in either of classifications A and B cannot be mapped to any code in the other one.\n", - round(length(which(as.vector(correspondenceAB$Unmatched) == 1))/nrow(correspondenceAB)*100,2),"% is unmatched which exceeds the mismatch tolerance of ", MismatchTolerance) - } - - - tryCatch({ - - # The final correspondence table A:B is sorted, firstly, based on - # classification A, and then, based on classification B. 
- correspondenceAB <- correspondenceAB[order(correspondenceAB[, 1], correspondenceAB[, - (length(RRR) + 1)/2], decreasing = FALSE), ] - - - # Redundancy_trim parameter (MP) - # Find the columns which are related to linking datasets which values need to be recorded as "Multiple" - ## 2 + n_link_data*2 + 1 = n_data - num_link = (length(test.names) - 3)/2 - col_multiple = numeric(0) - for (nl in 1:num_link){ - col_multiple = unique(c(col_multiple, grep(colnames(correspondenceAB)[1 + nl], colnames(correspondenceAB), value = T))) - } - max_col = num_link + 2 - - #Do redundancy trim only if redundancy is there (MP) - if (length(which(correspondenceAB$Redundancy == 1)) != 0){ - - # Find unique combination of A and B and identify them with a number - uniqueAB = unique(correspondenceAB[which(correspondenceAB$Redundancy == 1),c(1, max_col)]) - uniqueAB$id_to_use = 1:nrow(uniqueAB) - - correspondenceAB = merge(correspondenceAB, uniqueAB, by = colnames(correspondenceAB)[c(1,max_col)], all.x = TRUE)[, union(names(correspondenceAB), names(uniqueAB))] - col_link = grep("id_to_use", colnames(correspondenceAB), value = T) - - ### new but probably slower - if (Redundancy_trim == TRUE){ - x_temp = split(correspondenceAB[which(correspondenceAB$Redundancy == 1), col_multiple], correspondenceAB[which(correspondenceAB$Redundancy == 1), col_link]) - - for (i in 1:nrow(uniqueAB)){ - multiple_values = apply(x_temp[[i]], 2, function(x) length(unique(x))) - x_change = which(multiple_values != 1) - #replace with multiple - col_multiple_temp = c(col_multiple)[x_change] - correspondenceAB[which(correspondenceAB$Redundancy == 1 & correspondenceAB[,col_link] == i), unique(col_multiple_temp)] = "Multiple" - } - - correspondenceAB = correspondenceAB[,!names(correspondenceAB) %in% "id_to_use"] - - ### old but probably faster - ##replace with multiple - ##replace with multiple - #correspondenceAB[which(correspondenceAB$Redundancy == 1), unique(col_multiple)] = "Multiple" - - #eliminate duplicates - 
dup = as.numeric(duplicated(correspondenceAB[,1:max_col])) - correspondenceAB = correspondenceAB[which(dup == 0), ] - - } - } else { - - correspondenceAB = correspondenceAB - } - - if (Redundancy_trim==FALSE){ - #add a redundancy keep flag to indicate which row will be kept - dup = as.numeric(duplicated(correspondenceAB[,c(1,max_col)])) - red_col = which(colnames(correspondenceAB) == "Redundancy") - correspondenceAB$Redundancy_keep = rep(0, nrow(correspondenceAB)) - correspondenceAB$Redundancy_keep[which(dup == "0" & correspondenceAB$Redundancy == "1")] = 1 - correspondenceAB = correspondenceAB[,c(1:red_col, ncol(correspondenceAB), (red_col+1):(ncol(correspondenceAB)-1))] - correspondenceAB = correspondenceAB[,!names(correspondenceAB) %in% "id_to_use"] - } - - - # Create a data frame that contains the names of the classifications. - CsvNames <- data.frame(matrix(0, k + 2, 1)) - - CsvNames[1, 1] <- paste("A:", colnames(correspondenceAB)[1], sep = " ") - - CsvNames[k + 2, 1] <- paste("B:", colnames(correspondenceAB)[k + 2], sep = " ") - - for (i3 in seq(k) + 1) { - CsvNames[i3, 1] <- paste(paste("C", i3 - 1, ":", sep = ""), colnames(correspondenceAB)[i3], - sep = " ") - } - - CsvNames <- data.frame(CsvNames) - - ##Added condition when CSVout is null (MP) - if (!is.null(CSVout)) { - - pos <- regexpr("\\/[^\\/]*$", CSVout) - Name1 <- substr(CSVout, 1, pos[[1]]) - Name2 <- substr(CSVout, pos[[1]] + 1, nchar(CSVout)) - - pos <- regexpr("\\.[^\\.]*$", Name2) - if (pos[[1]] == -1) { - Name <- substr(Name2, pos[[1]] + 1, nchar(Name2)) - } else { - Name <- substr(Name2, 1, pos[[1]] - 1) - } - } - - colnames(CsvNames) <- paste("Classification:", "Name", sep = " ") - - # Create a data frame that contains the final correspondence table - # (final desired table). 
- Final <- apply(correspondenceAB, 2, function(x) { - gsub(" ", " ", x) - }) - - - if (is.null(dim(Final))) { - Final <- t(data.frame(Final)) - rownames(Final) <- 1 - } - - - }, error = function(e) { - stop(simpleError(paste("An error has occurred and execution needs to stop. Please check the input data. \n Deatils line 1895:\n",e))) - }) - - # Check so as to write (or not) the final correspondence table (final - # desired table) as well as the names of classifications in two seperate - # csv files. - tryCatch({ - - if (!is.null(CSVout)) { - data.table::fwrite(data.frame(Final, check.names = FALSE), file = CSVout, quote = TRUE) - utils::write.csv(CsvNames, file = paste0(Name1, "classificationNames_", Name2), - row.names = FALSE) - } - - }, error = function(e) { - stop(simpleError("An error occurred while trying to write the output to the specified files. Please check the respective input parameters.")) - }) - - # The final list that contains the final correspondence table (final - # desired table) as a data frame as well as the names of classifications as - # a data frame. - tryCatch({ - - FinalResults <- list() - FinalResults[[1]] <- data.frame(Final, check.names = FALSE, row.names = NULL) - FinalResults[[2]] <- CsvNames - names(FinalResults) <- c("newCorrespondenceTable", "classificationNames") - - # newCorrespondenceTable function returns the final correspondence - # table A:B, that contains the pivot classifications C1, C2, ..., Ck, - # as well as any supplementary information about the classification - # tables A, C1, C2, ..., Ck, B, and the correspondence tables A:C1, - # (C1:C2 - C2:C3), (C2:C3 - C3:C4), ..., (C(k-2):C(k-1) - C(k-1):Ck), - # B:Ck. - - return(FinalResults) - }, error = function(e) { - stop(simpleError(paste("An error has occurred and execution needs to stop. Please check the input data. 
\n Details line 1946:\n",e))) - }) - -} - +#' @title Ex novo creation of candidate correspondence tables between two classifications via pivot tables +#' @description Creation of a candidate correspondence table between two classifications, A and B, when there are +#' correspondence tables leading from the first classification to the second one via \eqn{k} intermediate pivot +#' classifications \eqn{C_1, \ldots, C_k}. +#' The correspondence tables leading from A to B are A:\eqn{C_1}, \{\eqn{C_i}:\eqn{C_{i+1}}: \eqn{1 \le i \le k -1}\}, B:\eqn{C_k}. +#' @param Tables A string of type character containing the name of a csv file which contains the names of the files that +#' contain the classifications and the intermediate correspondence tables (see "Details" below). +#' @param CSVout The preferred name for the \emph{output csv files} that will contain the candidate correspondence table +#' and information about the classifications involved. The valid values are \code{NULL} or strings of type \code{character}. +#' If the selected value is \code{NULL}, the default, no output file is produced. If the value is a string, then the output +#' is exported into two csv files whose names contain the provided name (see "Value" below). +#' @param Reference The reference classification among A and B. If a classification is the reference to the other, and hence +#' \emph{hierarchically superior} to it, each code of the other classification is expected to be mapped to at most one code +#' of the reference classification. The valid values are \code{"none"}, \code{"A"}, and \code{"B"}. If the selected value +#' is \code{"A"} or \code{"B"}, a "Review" flag column (indicating the records violating this expectation) is included +#' in the output (see "Explanation of the flags" below). +#' @param MismatchTolerance The maximum acceptable proportion of rows in the candidate correspondence table which contain +#' no code for classification A or no code for classification B. 
The default value is \code{0.2}. The valid values are
+#' real numbers in the interval [0, 1].
+#' @param Redundancy_trim An argument in the function containing the logical values \code{TRUE} or \code{FALSE}
+#' used to facilitate the trimming of the redundant records.
+#' The default value is \code{TRUE}, which removes all redundant records.
+#' The other value is \code{FALSE}, which shows redundant records together with the redundancy flag.
+#' @export
+#' @details
+#' File and file name requirements:
+#' \itemize{
+#' \item The file that corresponds to argument \code{Tables} and the files to which the contents of \code{Tables}
+#' lead, must be in \emph{csv format with comma as delimiter}. If full paths are not provided, then these files must
+#' be available in the working directory. No two filenames provided may be identical.
+#' \item The file that corresponds to argument \code{Tables} must contain filenames, \emph{and nothing else}, in
+#' a \eqn{(k+2) \times (k+2)} table, where \eqn{k}, a positive integer, is the number of "pivot" classifications.
+#' The cells in the main diagonal of the table provide the filenames of the files which contain, with this order,
+#' the classifications A, \eqn{C_1}, \eqn{\ldots}, \eqn{C_k} and B. The off-diagonal directly above the main
+#' diagonal contains the filenames of the files that contain, with this order, the correspondence tables
+#' A:\eqn{C_1}, \{\eqn{C_i}:\eqn{C_{i+1}}, \eqn{1 \le i \le k-1}\} and B:\eqn{C_k}. All other cells of the table
+#' must be empty.
+#' \item If any of the two files where the output will be stored is read protected (for instance because it is open
+#' elsewhere) an error message will be reported and execution will be halted.
+#' }
+#' Classification table requirements:
+#' \itemize{
+#' \item Each of the files that contain classifications must contain at least one column and at least two rows.
+#' The first column contains the codes of the respective classification. 
The first row contains column headers. +#' The header of the first column is the name of the respective classification (e.g., "CN 2021"). +#' \item The classification codes contained in a classification file (expected in its first column as mentioned +#' above) must be unique. No two identical codes are allowed in the column. +#' \item If any of the files that contain classifications has additional columns the first one of them is assumed +#' to contain the labels of the respective classification codes. +#' } +#' Correspondence table requirements: +#' \itemize{ +#' \item The files that contain correspondence tables must contain at least two columns and at least two rows. +#' The first column of the file that contains A:\eqn{C_1} contains the codes of classification A. The second column +#' contains the codes of classification \eqn{C_1}. Similar requirements apply to the files that contain +#' \eqn{C_i}:\eqn{C_{i+1}}, \eqn{1 \le i \le k-1} and B:\eqn{C_k}. The first row of each of the files that contain +#' correspondence tables contains column headers. The names of the first two columns are the names of the respective +#' classifications. +#' \item The pairs of classification codes contained in a correspondence table file (expected in its first two columns +#' as mentioned above) must be unique. No two identical pairs of codes are allowed in the first two columns. +#' } +#' Interdependency requirements: +#' \itemize{ +#' \item At least one code of classification A must appear in both the file of classification A and the file of +#' correspondence table A:\eqn{C_1}. +#' \item At least one code of classification B must appear in both the file of classification B and the file of +#' correspondence table B:\eqn{C_k}, where \eqn{k}, \eqn{k\ge 1}, is the number of pivot classifications. 
+#' \item If there is only one pivot classification, \eqn{C_1}, at least one code of it must appear in both the file of +#' correspondence table A:\eqn{C_1} and the file of correspondence table B:\eqn{C_1}. +#' \item If the pivot classifications are \eqn{k} with \eqn{k\ge 2} then at least one code of \eqn{C_1} must appear in +#' both the file of correspondence table A:\eqn{C_1} and the file of correspondence table \eqn{C_1}:\eqn{C_2}, at least one +#' code of each of the \eqn{C_i}, \eqn{i = 2, \ldots, k-1} (if \eqn{k\ge 3}) must appear in both the file of correspondence +#' table \eqn{C_{i-1}}:\eqn{C_i} and the file of correspondence table \eqn{C_i}:\eqn{C_{i+1}}, and at least one code of +#' \eqn{C_k} must appear in both the file of correspondence table \eqn{C_{k-1}}:\eqn{C_k} and the file of correspondence table +#' B:\eqn{C_k}. +#' } +#' Mismatch tolerance: +#' \itemize{ +#' \item The ratio that is compared with \code{MismatchTolerance} has as numerator the number of rows in the candidate +#' correspondence table which contain no code for classification A or no code for classification B and as denominator +#' the total number of rows of this table. If the ratio exceeds \code{MismatchTolerance} the execution of the function +#' is halted. +#' } +#' If any of the conditions required from the arguments is violated an error message is produced and execution is stopped. +#' +#' @section Explanation of the flags: +#' +#' \itemize{ +#' \item The "Review" flag is produced only if argument Reference has been set equal to "\code{A}" or "\code{B}". For +#' each row of the candidate correspondence table, if \code{Reference} = "\code{A}" the value of "Review" is equal to +#' \code{1} if the code of B maps to more than one code of A, and \code{0} otherwise. If \code{Reference} = "\code{B}" +#' the value of "Review" is equal to \code{1} if the code of A maps to more than one code of B, and \code{0} otherwise. 
+#' The value of the flag is empty if the row does not contain a code of A or a code of B. +#' \item For each row of the candidate correspondence table, the value of "Redundancy" is equal to \code{1} if the row +#' contains a combination of codes of A and B that also appears in at least one other row of the candidate +#' correspondence table. +#' \item When "Redundancy_Trim" is equal to \code{FALSE} the "Redundancy_keep" flag is created to identify with value \code{1} +#' the records that will be kept if trimming is performed. +#' \item For each row of the candidate correspondence table, the value of "Unmatched" is equal to \code{1} if the row +#' contains a code of A but no code of B or if it contains a code of B but no code of A. The value of the flag is +#' \code{0} if the row contains codes for both A and B. +#' \item For each row of the candidate correspondence table, the value of "NoMatchFromA" is equal to \code{1} if the row +#' contains a code of A that appears in the table of classification A but not in correspondence table A:\eqn{C_1}. The +#' value of the flag is \code{0} if the row contains a code of A that appears in both the table of classification A and +#' correspondence table A:\eqn{C_1}. Finally, the value of the flag is empty if the row contains no code of A or if it +#' contains a code of A that appears in correspondence table A:\eqn{C_1} but not in the table of classification A. +#' \item For each row of the candidate correspondence table, the value of "NoMatchFromB" is equal to \code{1} if the row +#' contains a code of B that appears in the table of classification B but not in correspondence table B:\eqn{C_k}. The +#' value of the flag is \code{0} if the row contains a code of B that appears in both the table of classification B and +#' correspondence table B:\eqn{C_k}. 
Finally, the value of the flag is empty if the row contains no code of B or if it +#' contains a code of B that appears in correspondence table B:\eqn{C_k} but not in the table of classification B. +#' \item The argument "Redundancy_trim" is used to delete all the redundancies which are mapping correctly. +#' The valid logical values for this argument in the candidate correspondence table are \code{TRUE} or \code{FALSE}. +#' If the selected value is \code{TRUE}, all redundant records are removed and exactly one record is kept for each unique combination. +#' For this retained record, the codes, the label and the supplementary information of the pivot classifications are replaced with +#' 'multiple'. If the information of the pivot classifications is the same across the redundant records, their value will not be replaced. +#' If the selected value is \code{FALSE}, no trimming is executed so redundant records are shown, together with the redundancy flag. +#' If the logical values are missing the implementation of the function will stop. +#' +#' } +#' +#' @section Sample datasets included in the package: +#' +#' Running \code{browseVignettes("correspondenceTables")} in the console opens an html page in the user's default browser. Selecting HTML from the menu, users can read information about the use of the sample datasets that are included in the package. +#' If they wish to access the csv files with the sample data, users have two options: +#' \itemize{ +#' \item Option 1: Unpack into any folder of their choice the tar.gz file into which the package has arrived. All sample +#' datasets may be found in the "inst/extdata" subfolder of this folder. +#' \item Option 2: Go to the "extdata" subfolder of the folder in which the package has been installed in their PC's \code{R} +#' library. All sample datasets may be found there. +#' } +#' +#' @return +#' \code{newCorrespondenceTable()} returns a list with two elements, both of which are data frames. 
+#' \itemize{ +#' \item The first element is the candidate correspondence table A:B, including the codes of all "pivot" classifications, +#' augmented with flags "Review" (if applicable), "Redundancy", "Unmatched", "NoMatchFromA", "NoMatchFromB" and with all +#' the additional columns of the classification and intermediate correspondence table files. +#' \item The second element contains the names of classification A, the "pivot" classifications and classification B as +#' read from the top left-hand side cell of the respective input files. +#' \item If the value of argument \code{CSVout} is a string of type \code{character}, the elements of the list are exported +#' into files of csv format. The name of the file for the first element is the value of argument \code{CSVout} and the +#' name of the file for the second element is classificationNames_\code{CSVout}. For example, if +#' \code{CSVout} = "newCorrespondenceTable.csv", the elements of the list are exported into "newCorrespondenceTable.csv" +#' and "classificationNames_newCorrespondenceTable.csv" respectively. +#' } +#' +#' @examples +#' { +#' ## Application of function newCorrespondenceTable() with "example.csv" being the file +#' ## that includes the names of the files and the intermediate tables in a sparse square +#' ## matrix containing the 100 rows of the classifications (from ISIC v4 to CPA v2.1 through +#' ## CPC v2.1). The desired name for the csv file that will contain the candidate +#' ## correspondence table is "newCorrespondenceTable.csv", the reference classification is +#' ## ISIC v4 ("A") and the maximum acceptable proportion of unmatched codes between +#' ## ISIC v4 and CPC v2.1 is 0.56 (this is the minimum mismatch tolerance for the first 100 rows +#' ## as 55.5% of the codes of ISIC v4 are unmatched). 
+#' +#' tmp_dir<-tempdir() +#' A <- read.csv(system.file("extdata", "example.csv", package = "correspondenceTables"), +#' header = FALSE, +#' sep = ",") +#' for (i in 1:nrow(A)) { +#' for (j in 1:ncol(A)) { +#' if (A[i,j]!="") { +#' A[i, j] <- system.file("extdata", A[i, j], package = "correspondenceTables") +#' }}} +#' write.table(x = A, +#' file = file.path(tmp_dir,"example.csv"), +#' row.names = FALSE, +#' col.names = FALSE, +#' sep = ",") +#' +#' NCT<-newCorrespondenceTable(file.path(tmp_dir,"example.csv"), +#' file.path(tmp_dir,"newCorrespondenceTable.csv"), +#' "A", +#' 0.56, +#' FALSE) +#' +#' summary(NCT) +#' head(NCT$newCorrespondenceTable) +#' NCT$classificationNames +#' csv_files<-list.files(tmp_dir, pattern = ".csv") +#' unlink(csv_files) +#' } + +newCorrespondenceTable <- function(Tables, CSVout = NULL, Reference = "none", MismatchTolerance = 0.2, Redundancy_trim = TRUE) { + + # Check if the file that contains the names of both classifications and + # correspondence tables exists in working directory + if (!file.exists(Tables)) { + stop(simpleError(paste("There is no file with name", Tables, "in your working directory."))) + } else { + # x <- as.matrix(utils::read.csv(Tables, sep = ",", header = FALSE, colClasses = c("character"), + # encoding = "UTF-8")) + x <- as.matrix(data.table::fread(Tables, sep = ",", header = FALSE, colClasses = c("character"), + encoding = "UTF-8")) + mat.list <- apply(x, 2, function(x) { + as.character(which(x != "")) + }) + } + + # Check if files exist in working directory + test.names <- as.vector(x)[which(as.vector(x) != "")] + if (!all(file.exists(test.names))) { + for (i in which(file.exists(test.names) == FALSE)) { + stop(simpleError(paste("The is no file with name", test.names[i], "in your working directory."))) + } + } + + if (length(which(duplicated(test.names) == TRUE)) >= 1) { + stop(simpleError(paste("At least two of the filenames in", Tables, "are the same."))) + } + + # Check CSVout + if (!is.null(CSVout)) { + 
while (file.exists(CSVout)) { + message(paste("Your working directory contains already a file with the name that you selected for the output file: ", + CSVout)) + answer <- utils::menu(c("Yes", "No"), title = "Do you want to overwrite it?") + if (answer == 2) { + CSVout <- readline(prompt = "Please enter a new name for the output file: ") + } + if (answer == 1) { + break + } + } + } + + test.list <- list() + test.list[[1]] <- "1" + for (mat.index in 2:ncol(x)) { + test.list[[mat.index]] <- as.character(c((mat.index - 1):mat.index)) + } + + # The following if statement checks if the names of both classifications + # and correspondence tables in the 'names.csv' file construct a sparse + # square matrix. + if (all(unlist(Map(identical, mat.list, test.list)) == TRUE) && nrow(x) >= 3) { + k <- nrow(x) - 2 + + } else { + # Error message in case the names of both classifications and + # correspondence tables in the 'names.csv' file do not construct a + # sparse square matrix. + stop(paste("The filenames in", Tables, "do not construct a sparse square matrix. \n Please verify that the appropriate number of filenames are inserted in the appropriate cells.")) + } + + # The list inputs includes the names of both classifications and + # correspondence tables. + inputs <- list() + inputs[[1]] <- diag(x)[1] + inputs[seq(k) + 1] = as.list(diag(x)[seq(k) + 1]) + inputs[[k + 2]] <- diag(x)[length(diag(x))] + inputs[(k + 3):(k + 2 + length(as.list(x[upper.tri(x)][x[upper.tri(x)] != ""])))] <- as.list(x[upper.tri(x)][x[upper.tri(x)] != + ""]) + + # Create a list of the classifications and the known correspondence tables + # as data frames. 
+ RRR <- lapply(inputs[1:length(inputs)], function(x) { + utils::read.csv(x, sep = ",", check.names = FALSE, colClasses = c("character"), + encoding = "UTF-8") + # data.table::fread(x, sep = ",", check.names = FALSE, colClasses = c("character"), + # encoding = "UTF-8") + }) + + # Check Reference + if (!(Reference %in% c("A", "B", "none"))) { + stop(simpleError("You entered a non-allowed value for Reference. The allowed values are \"A\", \"B\" and \"none\".")) + } + + # Check MismatchTolerance + if (is.character(MismatchTolerance) || MismatchTolerance < 0 || MismatchTolerance > + 1) { + stop(simpleError("You entered a non-allowed value for MismatchTolerance. The allowed values are numbers in the interval [0, 1].")) + } + + + + removeBOM <- function(headers) { + gsub("\\xef\\xbb\\xbf", "", headers, useBytes = T) + } + + for (i in 1:length(RRR)) { + colnames(RRR[[i]]) <- removeBOM(colnames(RRR[[i]])) + } + + # Convert data frames into matrices. + RR <- lapply(RRR, function(x) { + matrix(unlist(x), ncol = ncol(x)) + }) + + # Select the correspondence tables. + R <- RR[utils::tail(c(1:length(RR)), (length(RR) - 1)/2)] + + # Check the dimensions of the files + for (i in 1:nrow(x)) { + if (ncol(RRR[[i]]) < 1 || nrow(RRR[[i]]) < 1) { + stop(simpleError(paste("File", inputs[i], "should have at least one column and two rows (including the row of headers)."))) + } + } + + for (i in 1:length(R)) { + if (ncol(R[[i]]) <= 1 || nrow(R[[i]]) < 1) { + stop(simpleError(paste("File", inputs[i + nrow(x)], "should have at least two columns and two rows (including the row of headers)."))) + } + } + + # Check for entries dimensions of the files + for (i in 1:nrow(x)) { + if (sum(duplicated(RRR[[i]][, 1])) >= 1) { + stop(simpleError(paste("At least one code of ", colnames(RRR[[i]])[1], + " appears more than once in file ", inputs[i], ". This is an error. 
Each code must appear only once in the file.", + sep = ""))) + } + } + + for (i in 1:length(R)) { + if (nrow(unique(R[[i]][, 1:2])) != nrow(R[[i]][, 1:2])) { + stop(simpleError(paste("At least one pair of codes of ", colnames(RRR[[i + + nrow(x)]])[1], " and ", colnames(RRR[[i + nrow(x)]])[2], " appears more than once in file ", + inputs[i + nrow(x)], ". This is an error. Each pair of codes must appear only once in the file.", + sep = ""))) + } + } + + # Check for at least one match in classifications and correspondence + # tables. In inputs there are the names of both classifications and + # correspondence tables. Stop with error + if (k == 1) { + # A in A appears in A:C1 + if (sum(!is.na(match(unlist(RRR[[1]][, 1]), R[[1]][, 1]))) == 0) { + stop(simpleError(paste("There is no code of ", colnames(RRR[[1]])[1], + " that appears in both ", inputs[1], " and ", inputs[1 + nrow(x)], + ". This is an error. The files should have at least one code of ", + colnames(RRR[[1]])[1], " in common to allow the generation of the candidate correspondence table.", + sep = ""))) + } + + # C1 in A:C1 appears in B:C1 + if (sum(!is.na(match(R[[1]][, 2], R[[2]][, 2]))) == 0) { + stop(simpleError(paste("There is no code of ", colnames(RRR[[1 + nrow(x)]])[2], + " that appears in both ", inputs[1 + nrow(x)], " and ", inputs[2 + + nrow(x)], ". This is an error. The files should have at least one code of ", + colnames(RRR[[1 + nrow(x)]])[2], " in common to allow the generation of the candidate correspondence table.", + sep = ""))) + } + + # B in B:C1 appears in B + if (sum(!is.na(match(R[[length(R)]][, 1], unlist(RRR[[nrow(x)]][, 1])))) == 0) { + stop(simpleError(paste("There is no code of ", colnames(RRR[[length(R) + + nrow(x)]])[1], " that appears in both ", inputs[nrow(x)], " and ", + inputs[length(R) + nrow(x)], ". This is an error. 
The files should have at least one code of ", + colnames(RRR[[length(R) + nrow(x)]])[1], " in common to allow the generation of the candidate correspondence table.", + sep = ""))) + } + + } + + if (k >= 2) { + + # A in A appears in A:C1 + if (sum(!is.na(match(unlist(RRR[[1]][, 1]), R[[1]][, 1]))) == 0) { + stop(simpleError(paste("There is no code of ", colnames(RRR[[1]])[1], + " that appears in both ", inputs[1], " and ", inputs[1 + nrow(x)], + ". This is an error. The files should have at least one code of ", + colnames(RRR[[1]])[1], " in common to allow the generation of the candidate correspondence table.", + sep = ""))) + } + + # C1 in A:C1 appears in C1:C2 C2 in C1:C2 appears in C2:C3 ... + for (i in 1:(k - 1)) { + + if (sum(!is.na(match(R[[i]][, 2], R[[i + 1]][, 1]))) == 0) { + stop(simpleError(paste("There is no code of ", colnames(RRR[[i + + nrow(x)]])[2], " that appears in both ", inputs[i + nrow(x)], " and ", + inputs[i + 1 + nrow(x)], ". This is an error. The files should have at least one code of ", + colnames(RRR[[i + nrow(x)]])[2], " in common to allow the generation of the candidate correspondence table.", + sep = ""))) + } + + } + + # Ck in C(k-1):Ck appears in B:Ck + if (sum(!is.na(match(R[[k]][, 2], R[[k + 1]][, 2]))) == 0) { + stop(simpleError(paste("There is no code of ", colnames(RRR[[k + nrow(x)]])[2], + " that appears in both ", inputs[k + nrow(x)], " and ", inputs[k + + 1 + nrow(x)], ". This is an error. The files should have at least one code of ", + colnames(RRR[[k + nrow(x)]])[2], " in common to allow the generation of the candidate correspondence table.", + sep = ""))) + } + + # B in B:Ck appears in B + if (sum(!is.na(match(R[[length(R)]][, 1], unlist(RRR[[nrow(x)]][, 1])))) == 0) { + stop(simpleError(paste("There is no code of ", colnames(RRR[[length(R) + + nrow(x)]])[1], " that appears in both ", inputs[nrow(x)], " and ", + inputs[length(R) + nrow(x)], ". This is an error. 
The files should have at least one code of ", + colnames(RRR[[length(R) + nrow(x)]])[1], " in common to allow the generation of the candidate correspondence table.", + sep = ""))) + } + + } + + # Warning + if (k == 1) { + + # C1 in C1 appears in A:C1 + if (sum(!is.na(match(unlist(RRR[[2]][, 1]), R[[1]][, 2]))) == 0) { + message(paste("WARNING: there is no code of ", colnames(RRR[[2]])[1], " that appears in both ", + inputs[2], " and ", inputs[1 + nrow(x)], ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", + sep = "")) + } + + # C1 in C1 appears in B:C1 + if (sum(!is.na(match(unlist(RRR[[2]][, 1]), R[[2]][, 2]))) == 0) { + message(paste("WARNING: there is no code of ", colnames(RRR[[2]])[1], " that appears in both ", + inputs[2], " and ", inputs[2 + nrow(x)], ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", + sep = "")) + } + + } + + if (k == 2) { + + for (i in 2:k) { + + # C1 in C1 appears in A:C1 C2 in C2 appears in C1:C2 C3 in C3 + # appears in C2:C3 + if (sum(!is.na(match(unlist(RRR[[i]][, 1]), R[[i - 1]][, 2]))) == 0) { + message(paste("WARNING: there is no code of ", colnames(RRR[[i]])[1], + " that appears in both ", inputs[i], " and ", inputs[i - 1 + nrow(x)], + ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", + sep = "")) + } + + # C1 in C1 appears in C1:C2 C2 in C2 appears in C2:C3 C3 in C3 + # appears in C3:C4 + if (sum(!is.na(match(unlist(RRR[[i]][, 1]), R[[i]][, 1]))) == 0) { + message(paste("WARNING: there is no code of ", colnames(RRR[[i]])[1], + " that appears in both ", inputs[i], " and ", inputs[i + nrow(x)], + ". 
When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", + sep = "")) + } + } + + # Ck in Ck appears in C(k-1):Ck + if (sum(!is.na(match(unlist(RRR[[k + 1]][, 1]), R[[k]][, 2]))) == 0) { + message(paste("WARNING: there is no code of ", colnames(RRR[[k + 1]])[1], + " that appears in both ", inputs[k + 1], " and ", inputs[k + nrow(x)], + ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", + sep = "")) + } + + # Ck in Ck appears in B:Ck + if (sum(!is.na(match(unlist(RRR[[k + 1]][, 1]), R[[k + 1]][, 2]))) == 0) { + message(paste("WARNING: there is no code of ", colnames(RRR[[k + 1]])[1], + " that appears in both ", inputs[k + 1], " and ", inputs[k + 1 + + nrow(x)], ". When the execution of the function is over, please check the files to ensure that this is not the result of a mistake in their preparation or declaration.\n", + sep = "")) + } + + } + + # Create the final correspondence table moving from the classification A to + # the classification B. + tryCatch({ + + F_AtoB <- list() + + # The following if statement is used when we have only the + # correspondence tables A:C1 and B:C1. + counter <- 0 + if (length(R) == 2) { + #creating a progress bar + message("Percentage of codes of ", colnames(RRR[[1]][1]), " processed:") + pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") + + # The following for loop creates the desirable correspondence + # table. The operations are conducted for each unique element of + # classification A of the correspondence table A:C1. + for (i in unique(R[[1]][, 1])) { + + # Print the percentage of codes that have been processed. 
+ counter <- counter + 1 + setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) + # Matrix TT contains the rows of correspondence table A:C1 for + # a specific element of classification A. Matrix T contains + # the rows of correspondence table B:C1 that match with the + # specific element of classification A based on classification + # C1. + x1 <- R[[1]][which(R[[1]][, 1] == i), 2] + TT <- matrix(R[[1]][which(R[[1]][, 1] == i), 1:2], ncol = 2) + T <- matrix(R[[2]][!is.na(match(R[[2]][, 2], x1)), 1:2], ncol = 2) + + # Create a list whose each element is a matrix that contains + # all unique rows of matrix T based on the elements of + # classification C1. + t <- match(T[, 2], T[, 2]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + T[order(t)[x], , drop = FALSE] + }) + + # Create a list whose each element is a matrix that contains + # all unique rows of matrix TT that match with the unique + # elements of the second column of matrix T. + t1 <- match(TT[, 2], T[, 2]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + TT[order(t1)[x], , drop = FALSE] + }) + + # Keep matrices in Z that exist in Z1 based on their second + # columns (elements of classification C1). + Z <- Z[!is.na(match(lapply(Z, function(x) { + unique(x[, 2]) + }), lapply(Z1, function(x) { + unique(x[, 2]) + })))] + + # ZZ is a matrix that consists of matrices in Z1 expanded by + # their corresponding matrices (based on the elements of + # classification C1). 
+ a <- lapply(Z, function(x) { + 1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = 2) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + ZZ <- do.call(rbind, Map(cbind, aa1, aa)) + + # The records of A:C1 that do not exist in C1:C2 (in terms of + # the values of classification C1) are adjusted to ZZ which + # consists of records of A:C1 that exist in C1:C2 (in terms of + # the values of classification C1). + t1 <- matrix(TT[is.na(match(TT[, 2], ZZ[, 2])), ], ncol = 2) + ZZ <- rbind(ZZ, cbind(t1, matrix("", nrow = nrow(t1), ncol = 2))) + + F_AtoB[[counter]] <- ZZ + + } + } + + # The following if statement is used when we have only the + # correspondence tables A:C1, C1:C2 and B:C2. + if (length(R) == 3) { + #creating a progress bar + message("Percentage of codes of ", colnames(RRR[[1]][1]), " processed:") + pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") + + + # The following for loop creates the desirable correspondence + # table. The operations are conducted for each unique element of + # classification A of the correspondence table A:C1. + for (i in unique(R[[1]][, 1])) { + counter <- counter + 1 + setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) + + # Matrix T contains the rows of correspondence table C1:C2 that + # match with the specific element of classification A based on + # classification C1. + x1 <- R[[1]][which(R[[1]][, 1] == i), 2] + T <- matrix(R[[2]][!is.na(match(R[[2]][, 1], x1)), 1:2], ncol = 2) + + # The records of A:C1 that do not exist in C1:C2 (in terms of + # the values of classification C1). 
+ if (length(which(is.na(match(x1, T[, 1])) == TRUE)) > 0) { + M1 <- matrix(matrix(R[[1]][which(R[[1]][, 1] == i), 1:2], ncol = 2)[is.na(match(x1, + T[, 1])), ], ncol = 2) + } else { + M1 = matrix(0, 1, 2 * length(R)) + M1 = M1[FALSE, ] + } + + if (nrow(M1) != 0) { + for (times in 1:(2 * length(R) - ncol(M1))) { + + M1 <- cbind(M1, "") + + } + } + + # Matrix TT contains the rows of correspondence table B:C2 that + # match with the specific element of classification A based on + # classification C1. + x2 <- R[[2]][!is.na(match(R[[2]][, 1], x1)), 2] + T1 <- matrix(R[[3]][!is.na(match(R[[3]][, 2], x2)), 1:2], ncol = 2) + + # The records of C1:C2 that do not exist in B:C2 (in terms of + # the values of classification C2). + if (length(which(is.na(match(x2, T1[, 2])) == TRUE)) > 0) { + if (length(which(is.na(match(x2, T1[, 2])) == TRUE)) == 1) { + M2 <- matrix(c(i, T[is.na(match(x2, T1[, 2])), 1], T[is.na(match(x2, + T1[, 2])), ]), ncol = 4) + } else { + M2 <- cbind(i, T[is.na(match(x2, T1[, 2])), 1], T[is.na(match(x2, + T1[, 2])), ]) + } + } else { + M2 = matrix(0, 1, 2 * length(R)) + M2 = M2[FALSE, ] + } + + if (nrow(M2) != 0) { + for (times in 1:(2 * length(R) - ncol(M2))) { + + M2 <- cbind(M2, "") + + } + } + + # Create a list whose each element is a matrix that contains + # all unique rows of matrix T based on the elements of + # classification C1. + t <- match(T[, 2], T[, 2]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + T[order(t)[x], , drop = FALSE] + }) + + # Create a list whose each element is a matrix that contains + # all unique rows of matrix TT that match with the unique + # elements of the second column of matrix T. 
+ t1 <- match(T1[, 2], T[, 2]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + T1[order(t1)[x], , drop = FALSE] + }) + + # Keep matrices in Z that exist in Z1 based on their second + # columns (elements of classification C1). + Z <- Z[!is.na(match(lapply(Z, function(x) { + unique(x[, 2]) + }), lapply(Z1, function(x) { + unique(x[, 2]) + })))] + + # ZZ is a matrix that consists of matrices in Z1 expanded by + # their corresponding matrices (based on the elements of + # classification C1). + a <- lapply(Z, function(x) { + 1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = 2) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + ZZ <- do.call(rbind, Map(cbind, aa, aa1)) + + # The records of both M1 and M2 are adjusted to ZZ which + # consists of records of A:C1 that exist in C1:C2 (in terms of + # the values of classification C1). + if (is.null(dim(ZZ))) { + F_AtoB[[counter]] <- rbind(M1, M2) + } else { + F_AtoB[[counter]] <- rbind(cbind(i, ZZ[, 1], ZZ), M1, M2) + } + + } + + } + + # The following if statement is used in the general situation, in which + # we have the correspondence tables A:C1, Ci:C(i+1) for i = 1, ..., + # (k-1) Ci and B:Ck. + M <- list() + if (length(R) >= 4) { + message("Percentage of codes of ", colnames(RRR[[1]][1]), " processed:") + pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") + + + # The following for loop creates the desirable correspondence + # table. The operations are conducted for each unique element of + # classification A of the correspondence table A:C1. 
+ for (i in unique(R[[1]][, 1])) { + + counter <- counter + 1 + setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) + + + for (j in 1:(length(R) - 2)) { + # The same operations as in the case that we have only the + # correspondence tables A:C1 and B:C1, but here for the + # correspondence tables C1:C2 and C2:C3. + if (j == 1) { + + x1 <- R[[j]][which(R[[j]][, 1] == i), 2] + T <- matrix(R[[j + 1]][!is.na(match(R[[j + 1]][, 1], x1)), 1:2], + ncol = 2) + + # The records of A:C1 that do not exist in C1:C2 (in terms + # of the values of classification C1) + + if (length(which(is.na(match(x1, T[, 1])) == TRUE)) > 0) { + M1 <- matrix(matrix(R[[j]][which(R[[j]][, 1] == i), 1:2], ncol = 2)[is.na(match(x1, + T[, 1])), ], ncol = 2) + } else { + M1 = matrix(0, 1, 2 * length(R)) + M1 = M1[FALSE, ] + } + + if (nrow(M1) != 0) { + for (times in 1:(2 * length(R) - ncol(M1))) { + + M1 <- cbind(M1, "") + + } + } + + x2 <- R[[j + 1]][!is.na(match(R[[j + 1]][, 1], x1)), 2] + T1 <- matrix(R[[j + 2]][!is.na(match(R[[j + 2]][, 1], x2)), 1:2], + ncol = 2) + + if (length(which(is.na(match(x2, T1[, 1])) == TRUE)) > 0) { + + if (length(which(is.na(match(x2, T1[, 1])) == TRUE)) == 1) { + M2 <- matrix(c(i, T[is.na(match(x2, T1[, 1])), 1], T[is.na(match(x2, + T1[, 1])), ]), ncol = 4) + + } else { + M2 <- cbind(i, T[is.na(match(x2, T1[, 1])), 1], T[is.na(match(x2, + T1[, 1])), ]) + } + } else { + M2 = matrix(0, 1, 2 * length(R)) + M2 = M2[FALSE, ] + } + + if (nrow(M2) != 0) { + for (times in 1:(2 * length(R) - ncol(M2))) { + + M2 <- cbind(M2, "") + + } + } + + t <- match(T[, 2], T[, 2]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + T[order(t)[x], , drop = FALSE] + }) + + t1 <- match(T1[, 1], T[, 2]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + T1[order(t1)[x], , drop = FALSE] + }) + + Z <- Z[!is.na(match(lapply(Z, 
function(x) { + unique(x[, 2]) + }), lapply(Z1, function(x) { + unique(x[, 1]) + })))] + + a <- lapply(Z, function(x) { + 1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = 2) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + ZZ <- do.call(rbind, Map(cbind, aa, aa1)) + + } + + # The same operations as in the case that we have only the + # correspondence tables A:C1 and B:C1, but here for the pairs + # of correspondence tables (C2:C3 - C3:C4), (C3:C4 - C4:C5), + # ..., (C(k-2):C(k-1) - C(k-1):Ck). For each value of j that + # satisfies the if statement, the previous matrix ZZ created + # is used. For j = 2, the matrix ZZ created in the previous + # if statement is used. + if (j >= 2 && j <= (length(R) - 3) && length(R) != 4) { + + t <- match(ZZ[, ncol(ZZ)], ZZ[, ncol(ZZ)]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + ZZ[order(t)[x], , drop = FALSE] + }) + + t1 <- match(R[[j + 2]][, 1], ZZ[, ncol(ZZ)]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + R[[j + 2]][order(t1)[x], 1:2, drop = FALSE] + }) + + if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])) == + TRUE)) > 0) { + if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])) == + TRUE)) == 1) { + M3 <- matrix(c(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, + 1])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])), + ]), ncol = ncol(ZZ) + 2) + } else { + M3 <- cbind(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, + 1])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])), + ]) + } + } else { + M3 = matrix(0, 1, 2 * length(R)) + M3 = M3[FALSE, ] + } + + if (nrow(M3) != 0) { + for (times in 1:(2 * length(R) - ncol(M3))) { + + M3 <- 
cbind(M3, "") + + } + } + + M[[j - 1]] <- M3 + + Z <- Z[!is.na(match(lapply(Z, function(x) { + unique(x[, ncol(ZZ)]) + }), lapply(Z1, function(x) { + unique(x[, 1]) + })))] + + a <- lapply(Z, function(x) { + 1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = ncol(ZZ)) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + + ZZ <- do.call(rbind, Map(cbind, aa, aa1)) + + } + + # The same operations as in the case that we have only the + # correspondence tables A:C1 and B:C1, but here for the + # correspondence tables C(k-1):Ck and B:Ck. For the value of + # j that satisfies the if statement, the matrix ZZ created in + # the previous if statement is used. + if (j == (length(R) - 2)) { + + t <- match(ZZ[, ncol(ZZ)], ZZ[, ncol(ZZ)]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + ZZ[order(t)[x], , drop = FALSE] + }) + + t1 <- match(R[[length(R)]][, 2], ZZ[, ncol(ZZ)]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + R[[length(R)]][order(t1)[x], 1:2, drop = FALSE] + }) + + if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])) == TRUE)) > 0) { + if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])) == TRUE)) == 1) { + M4 <- matrix(c(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])), ]), ncol = ncol(ZZ) + 2) + } else { + M4 <- cbind(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])), ]) + } + } else { + M4 = matrix(0, 1, 2 * length(R)) + M4 = M4[FALSE, ] + } + + + if (nrow(M4) != 0) { + for (times in 1:(2 * length(R) - ncol(M4))) { + + M4 <- cbind(M4, "") + + } + 
} + + Z <- Z[!is.na(match(lapply(Z, function(x) { + unique(x[, ncol(ZZ)]) + }), lapply(Z1, function(x) { + unique(x[, 2]) + })))] + + a <- lapply(Z, function(x) { + 1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = ncol(ZZ)) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + + ZZ <- do.call(rbind, Map(cbind, aa, aa1)) + + } + } + + if (is.null(dim(ZZ))) { + F_AtoB[[counter]] <- rbind(M1, M2, do.call(rbind, M), M4) + } else { + F_AtoB[[counter]] <- rbind(cbind(i, ZZ[, 1], ZZ), M1, M2, do.call(rbind, + M), M4) + } + } + } + + # Create the desired correspondence table for the selected element of + # classification A. + F_AtoB <- do.call(rbind, F_AtoB) + + # Keep in F the classifications A, C1, C2, ..., Ck, B once, based on + # the number of the correspondence tables. + if (length(R) == 2) { + F_AtoB <- F_AtoB[, c(1, 2, 3)] + } + if (length(R) == 3) { + F_AtoB <- F_AtoB[, c(1, 2, 4, 5)] + } + if (length(R) >= 4) { + F_AtoB <- F_AtoB[, sort(c(1, seq(2, 2 * length(R) - 2, 2), 2 * length(R) - + 1))] + } + + # Convert classifications as well as correspondence tables so as to + # move from classification B to classification A. Until the next + # comment, all the lines are the same as in the case that we move from + # classification A to classification B. 
+ RRR_BtoA <- RRR[c(rev(1:(k + 2)), rev(utils::tail(c(1:length(RRR)), (length(RRR) - + 1)/2)))] + if (length(rev(utils::tail(c(1:length(RR)), (length(RR) - 1)/2))) >= 3) { + for (rev in (k + 4):(length(RRR_BtoA) - 1)) { + column_2 <- RRR_BtoA[[rev]][, 2] + RRR_BtoA[[rev]][, 2] <- RRR_BtoA[[rev]][, 1] + RRR_BtoA[[rev]][, 1] <- column_2 + } + } + + RR <- lapply(RRR_BtoA, function(x) { + matrix(unlist(x), ncol = ncol(x)) + }) + + R <- RR[utils::tail(c(1:length(RR)), (length(RR) - 1)/2)] + + F_BtoA <- list() + + counter <- 0 + message("\n") + if (length(R) == 2) { + + message("Percentage of codes of ", colnames(RRR_BtoA[[1]][1]), " processed:") + pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") + + for (i in unique(R[[1]][, 1])) { + + counter <- counter + 1 + setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) + + x1 <- R[[1]][which(R[[1]][, 1] == i), 2] + TT <- matrix(R[[1]][which(R[[1]][, 1] == i), 1:2], ncol = 2) + T <- matrix(R[[2]][!is.na(match(R[[2]][, 2], x1)), 1:2], ncol = 2) + + t <- match(T[, 2], T[, 2]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + T[order(t)[x], , drop = FALSE] + }) + + t1 <- match(TT[, 2], T[, 2]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + TT[order(t1)[x], , drop = FALSE] + }) + + Z <- Z[!is.na(match(lapply(Z, function(x) { + unique(x[, 2]) + }), lapply(Z1, function(x) { + unique(x[, 2]) + })))] + + a <- lapply(Z, function(x) { + 1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = 2) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + ZZ <- do.call(rbind, Map(cbind, aa1, aa)) + + t1 <- matrix(TT[is.na(match(TT[, 2], ZZ[, 
2])), ], ncol = 2) + ZZ <- rbind(ZZ, cbind(t1, matrix("", nrow = nrow(t1), ncol = 2))) + + F_BtoA[[counter]] <- ZZ + + } + } + + if (length(R) == 3) { + message("Percentage of codes of ", colnames(RRR_BtoA[[1]][1]), " processed:") + pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") + for (i in unique(R[[1]][, 1])) { + + counter <- counter + 1 + setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) + + x1 <- R[[1]][which(R[[1]][, 1] == i), 2] + T <- matrix(R[[2]][!is.na(match(R[[2]][, 1], x1)), 1:2], ncol = 2) + + if (length(which(is.na(match(x1, T[, 1])) == TRUE)) > 0) { + M1 <- matrix(matrix(R[[1]][which(R[[1]][, 1] == i), 1:2], ncol = 2)[is.na(match(x1, + T[, 1])), ], ncol = 2) + } else { + M1 = matrix(0, 1, 2 * length(R)) + M1 = M1[FALSE, ] + } + + if (nrow(M1) != 0) { + for (times in 1:(2 * length(R) - ncol(M1))) { + + M1 <- cbind(M1, "") + + } + } + + x2 <- R[[2]][!is.na(match(R[[2]][, 1], x1)), 2] + T1 <- matrix(R[[3]][!is.na(match(R[[3]][, 2], x2)), 1:2], ncol = 2) + + if (length(which(is.na(match(x2, T1[, 2])) == TRUE)) > 0) { + if (length(which(is.na(match(x2, T1[, 2])) == TRUE)) == 1) { + M2 <- matrix(c(i, T[is.na(match(x2, T1[, 2])), 1], T[is.na(match(x2, + T1[, 2])), ]), ncol = 4) + } else { + M2 <- cbind(i, T[is.na(match(x2, T1[, 2])), 1], T[is.na(match(x2, + T1[, 2])), ]) + } + } else { + M2 = matrix(0, 1, 2 * length(R)) + M2 = M2[FALSE, ] + } + + if (nrow(M2) != 0) { + for (times in 1:(2 * length(R) - ncol(M2))) { + + M2 <- cbind(M2, "") + + } + } + + t <- match(T[, 2], T[, 2]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + T[order(t)[x], , drop = FALSE] + }) + + t1 <- match(T1[, 2], T[, 2]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + T1[order(t1)[x], , drop = FALSE] + }) + + Z <- Z[!is.na(match(lapply(Z, function(x) { + unique(x[, 2]) + }), 
lapply(Z1, function(x) { + unique(x[, 2]) + })))] + + a <- lapply(Z, function(x) { + 1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = 2) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + ZZ <- do.call(rbind, Map(cbind, aa, aa1)) + + if (is.null(dim(ZZ))) { + F_BtoA[[counter]] <- rbind(M1, M2) + } else { + F_BtoA[[counter]] <- rbind(cbind(i, ZZ[, 1], ZZ), M1, M2) + } + + } + + } + M <- list() + if (length(R) >= 4) { + message("Percentage of codes of ", colnames(RRR_BtoA[[1]][1]), " processed:") + pb <- txtProgressBar(min = 0, max = 100, style = 3, width = 50, char = "=") + for (i in unique(R[[1]][, 1])) { + + counter <- counter + 1 + setTxtProgressBar(pb, round(counter/length(unique(R[[1]][, 1])) * 100, digits = 0)) + + for (j in 1:(length(R) - 2)) { + if (j == 1) { + + x1 <- R[[j]][which(R[[j]][, 1] == i), 2] + T <- matrix(R[[j + 1]][!is.na(match(R[[j + 1]][, 1], x1)), 1:2], + ncol = 2) + + if (length(which(is.na(match(x1, T[, 1])) == TRUE)) > 0) { + M1 <- matrix(matrix(R[[j]][which(R[[j]][, 1] == i), 1:2], ncol = 2)[is.na(match(x1, + T[, 1])), ], ncol = 2) + } else { + M1 = matrix(0, 1, 2 * length(R)) + M1 = M1[FALSE, ] + } + + if (nrow(M1) != 0) { + for (times in 1:(2 * length(R) - ncol(M1))) { + + M1 <- cbind(M1, "") + + } + } + + x2 <- R[[j + 1]][!is.na(match(R[[j + 1]][, 1], x1)), 2] + T1 <- matrix(R[[j + 2]][!is.na(match(R[[j + 2]][, 1], x2)), 1:2], + ncol = 2) + + if (length(which(is.na(match(x2, T1[, 1])) == TRUE)) > 0) { + + if (length(which(is.na(match(x2, T1[, 1])) == TRUE)) == 1) { + M2 <- matrix(c(i, T[is.na(match(x2, T1[, 1])), 1], T[is.na(match(x2, + T1[, 1])), ]), ncol = 4) + + } else { + M2 <- cbind(i, T[is.na(match(x2, T1[, 1])), 1], T[is.na(match(x2, + T1[, 1])), ]) + } + } else { + M2 = matrix(0, 1, 2 * length(R)) + M2 = M2[FALSE, ] 
+ } + + if (nrow(M2) != 0) { + for (times in 1:(2 * length(R) - ncol(M2))) { + + M2 <- cbind(M2, "") + + } + } + + t <- match(T[, 2], T[, 2]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + T[order(t)[x], , drop = FALSE] + }) + + t1 <- match(T1[, 1], T[, 2]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + T1[order(t1)[x], , drop = FALSE] + }) + + Z <- Z[!is.na(match(lapply(Z, function(x) { + unique(x[, 2]) + }), lapply(Z1, function(x) { + unique(x[, 1]) + })))] + + a <- lapply(Z, function(x) { + 1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = 2) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + ZZ <- do.call(rbind, Map(cbind, aa, aa1)) + + } + + if (j >= 2 && j <= (length(R) - 3) && length(R) != 4) { + + t <- match(ZZ[, ncol(ZZ)], ZZ[, ncol(ZZ)]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + ZZ[order(t)[x], , drop = FALSE] + }) + + t1 <- match(R[[j + 2]][, 1], ZZ[, ncol(ZZ)]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + R[[j + 2]][order(t1)[x], 1:2, drop = FALSE] + }) + + if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])) == + TRUE)) > 0) { + if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])) == + TRUE)) == 1) { + M3 <- matrix(c(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, + 1])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])), + ]), ncol = ncol(ZZ) + 2) + } else { + M3 <- cbind(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, + 1])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[j + 2]][, 1])), + ]) + } + } else { + M3 = matrix(0, 1, 2 * 
length(R)) + M3 = M3[FALSE, ] + } + + if (nrow(M3) != 0) { + for (times in 1:(2 * length(R) - ncol(M3))) { + + M3 <- cbind(M3, "") + + } + } + M[[j - 1]] <- M3 + + Z <- Z[!is.na(match(lapply(Z, function(x) { + unique(x[, ncol(ZZ)]) + }), lapply(Z1, function(x) { + unique(x[, 1]) + })))] + + a <- lapply(Z, function(x) { + 1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = ncol(ZZ)) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + + ZZ <- do.call(rbind, Map(cbind, aa, aa1)) + + } + + if (j == (length(R) - 2)) { + + t <- match(ZZ[, ncol(ZZ)], ZZ[, ncol(ZZ)]) + v1 <- sequence(rle(sort(t))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z <- lapply(v1, function(x) { + ZZ[order(t)[x], , drop = FALSE] + }) + + t1 <- match(R[[length(R)]][, 2], ZZ[, ncol(ZZ)]) + v1 <- sequence(rle(sort(t1))$lengths) + v1 <- split(seq_along(v1), cumsum(v1 == 1)) + Z1 <- lapply(v1, function(x) { + R[[length(R)]][order(t1)[x], 1:2, drop = FALSE] + }) + + if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])) == TRUE)) > 0) { + if (length(which(is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])) == TRUE)) == 1) { + M4 <- matrix(c(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])), ]), ncol = ncol(ZZ) + 2) + } else { + M4 <- cbind(i, ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])), 1], ZZ[is.na(match(ZZ[, ncol(ZZ)], R[[length(R)]][, + 2])), ]) + } + } else { + M4 = matrix(0, 1, 2 * length(R)) + M4 = M4[FALSE, ] + } + + + if (nrow(M4) != 0) { + for (times in 1:(2 * length(R) - ncol(M4))) { + + M4 <- cbind(M4, "") + + } + } + + Z <- Z[!is.na(match(lapply(Z, function(x) { + unique(x[, ncol(ZZ)]) + }), lapply(Z1, function(x) { + unique(x[, 2]) + })))] + + a <- lapply(Z, function(x) { + 
1:nrow(x) + }) + a1 <- lapply(Z1, function(x) { + 1:nrow(x) + }) + + aa <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z, Map(expand.grid, a, a1)), function(x) { + matrix(x, ncol = ncol(ZZ)) + }) + aa1 <- lapply(Map(function(x, y) { + x[y[, 1], ] + }, Z1, Map(expand.grid, a1, a)), function(x) { + matrix(x, ncol = 2) + }) + + ZZ <- do.call(rbind, Map(cbind, aa, aa1)) + + } + } + + if (is.null(dim(ZZ))) { + F_BtoA[[counter]] <- rbind(M1, M2, do.call(rbind, M), M4) + } else { + F_BtoA[[counter]] <- rbind(cbind(i, ZZ[, 1], ZZ), M1, M2, do.call(rbind, + M), M4) + } + + + } + } + + F_BtoA <- do.call(rbind, F_BtoA) + + if (length(R) == 2) { + F_BtoA <- F_BtoA[, c(1, 2, 3)] + } + if (length(R) == 3) { + F_BtoA <- F_BtoA[, c(1, 2, 4, 5)] + } + if (length(R) >= 4) { + F_BtoA <- F_BtoA[, sort(c(1, seq(2, 2 * length(R) - 2, 2), 2 * length(R) - + 1))] + } + + + F_BtoA <- F_BtoA[, rev(1:ncol(F_BtoA))] + # Combine the results from moving from classification A to B, and vice + # versa. F_AtoB + keep <- 0 + keepF_AtoB <- c(0) + for (iterr in 1:nrow(F_AtoB)) { + + if (F_AtoB[iterr, 1] != "") { + blanks <- F_AtoB[iterr, ] == "" + + if (all(blanks == FALSE)) { + keep <- keep + 1 + keepF_AtoB[keep] <- iterr + } else { + blanks = which(F_AtoB[iterr, ] == "") + if (all(c(blanks[1]:ncol(F_AtoB)) == "")) { + keep <- keep + 1 + keepF_AtoB[keep] <- iterr + } + } + + } + } + + NoNullF_AtoB <- matrix(F_AtoB[keepF_AtoB, ], ncol = k + 2) + if (nrow(NoNullF_AtoB) != nrow(F_AtoB)) { + if (length(keepF_AtoB) == 1 && keepF_AtoB == c(0)) { + FNullAtoB <- matrix(F_AtoB, ncol = k + 2) + for (iter in 1:nrow(FNullAtoB)) { + FNullAtoB[iter, (which(FNullAtoB[iter, ] == "")[1]):(k + 2)] <- "" + } + } else { + FNullAtoB <- matrix(F_AtoB[-keepF_AtoB, ], ncol = k + 2) + for (iter in 1:nrow(FNullAtoB)) { + FNullAtoB[iter, (which(FNullAtoB[iter, ] == "")[1]):(k + 2)] <- "" + } + } + } else { + FNullAtoB <- matrix(0, 1, k + 2) + FNullAtoB <- FNullAtoB[FALSE, ] + } + + # F_BtoA + keep <- 0 + keepF_BtoA <- c(0) 
+ for (iterr in 1:nrow(F_BtoA)) { + + if (F_BtoA[iterr, ncol(F_AtoB)] != "") { + blanks <- F_BtoA[iterr, ] == "" + + if (all(blanks == FALSE)) { + keep <- keep + 1 + keepF_BtoA[keep] <- iterr + } else { + blanks <- which(F_BtoA[iterr, ] == "") + if (all(c(1:length(blanks)) == "")) { + keep <- keep + 1 + keepF_BtoA[keep] <- iterr + } + } + + } + } + + # Combine all together + + NoNullF_BtoA <- matrix(F_BtoA[keepF_BtoA, ], ncol = k + 2) + if (nrow(NoNullF_BtoA) != nrow(F_BtoA)) { + if (length(keepF_BtoA) == 1 && keepF_BtoA == c(0)) { + FNullBtoA <- matrix(F_BtoA, ncol = k + 2) + for (iter in 1:nrow(FNullBtoA)) { + FNullBtoA[iter, (which(FNullBtoA[iter, ] == "")[length(which(FNullBtoA[iter, + ] == ""))]):1] <- "" + } + } else { + FNullBtoA <- matrix(F_BtoA[-keepF_BtoA, ], ncol = k + 2) + for (iter in 1:nrow(FNullBtoA)) { + FNullBtoA[iter, (which(FNullBtoA[iter, ] == "")[length(which(FNullBtoA[iter, + ] == ""))]):1] <- "" + } + } + } else { + FNullBtoA <- matrix(0, 1, k + 2) + FNullBtoA <- FNullBtoA[FALSE, ] + } + + F <- unique(rbind(NoNullF_AtoB, NoNullF_BtoA)) + F <- unique(rbind(F, unique(FNullAtoB), unique(FNullBtoA))) + if (length(which(apply(F, 1, function(x) { + length(which(x == "")) + } == k + 2) == TRUE)) >= 1) { + F <- F[-which(apply(F, 1, function(x) { + length(which(x == "")) + } == k + 2) == TRUE), ] + } + + # The if statement is based on which of classifications A or B is the + # reference one (if any). + if (length(which(apply(F, 1, function(x) { + length(which(x == "")) + }) == 0)) >= 1) { + + if (Reference == "A") { + idx <- k + 5 + + # Creation of the review flag for the correspondence table A:B. 
+ F1 <- matrix(F[apply(F, 1, function(x) { + length(which(x == "")) + }) == 0, ], ncol = k + 2) + F2 <- F[apply(F, 1, function(x) { + length(which(x == "")) + }) >= 1, ] + F2 <- matrix(unlist(F2), ncol = k + 2) + f <- stats::aggregate(matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, 2], + list(num = matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, 2]), + length)[which(stats::aggregate(matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, + 2], list(num = matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, + 2]), length)[, 2] > 1), 1] + reviewF1 <- rep(0, nrow(F1)) + reviewF1[which(F1[, ncol(F1)] %in% f)] <- 1 + Review <- data.frame(cbind(rbind(F1, F2), c(reviewF1, rep(0, nrow(F2))))) + + # Creation of the redundancy flag for the correspondence table + # A:B. + F1 <- Review[apply(Review, 1, function(x) { + length(which(x == "")) + }) == 0, ] + F1 <- matrix(unlist(F1), ncol = k + 3) + F1 <- data.frame(F1) + colnames(F1) <- colnames(Review) + F2 <- Review[apply(Review, 1, function(x) { + length(which(x == "")) + }) >= 1, ] + F2 <- matrix(unlist(F2), ncol = k + 3) + F2 <- data.frame(F2) + colnames(F2) <- colnames(F1) + f1 <- stats::aggregate(F1[, c(1, ncol(F1) - 1)], by = F1[, c(1, ncol(F1) - + 1)], length)[1:(ncol(F1[, c(1, ncol(F1) - 1)]) + 1)][which(stats::aggregate(F1[, + c(1, ncol(F1) - 1)], by = F1[, c(1, ncol(F1) - 1)], length)[1:(ncol(F1[, + c(1, ncol(F1) - 1)]) + 1)][, 3] >= 2), 1:2] + redundancyF1 <- rep(0, nrow(F1)) + redundancyF1[which(apply(F1[, c(1, ncol(F1) - 1)], 1, paste, collapse = "") %in% + apply(f1, 1, paste, collapse = ""))] <- 1 + correspondenceAB <- data.frame(cbind(rbind(F1, F2), c(redundancyF1, + rep(0, nrow(F2))))) + + # Creation of the unmatched flag for the correspondence table + # A:B. 
+ correspondenceAB <- data.frame(correspondenceAB, 1) + colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, + function(x) { + colnames(x)[1] + }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Review", "Redundancy", + "Unmatched") + + } else if (Reference == "B") { + idx <- k + 5 + + # Creation of the review flag for the correspondence table A:B. + F1 <- matrix(F[apply(F, 1, function(x) { + length(which(x == "")) + }) == 0, ], ncol = k + 2) + F2 <- F[apply(F, 1, function(x) { + length(which(x == "")) + }) >= 1, ] + F2 <- matrix(unlist(F2), ncol = k + 2) + f <- stats::aggregate(matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, 1], + list(num = matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, 1]), + length)[which(stats::aggregate(matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, + 1], list(num = matrix(unique(F1[, c(1, ncol(F1))]), ncol = 2)[, + 1]), length)[, 2] > 1), 1] + reviewF1 <- rep(0, nrow(F1)) + reviewF1[which(F1[, 1] %in% f)] <- 1 + Review <- data.frame(cbind(rbind(F1, F2), c(reviewF1, rep(0, nrow(F2))))) + + # Creation of the redundancy flag for the correspondence table + # A:B. 
+ F1 <- Review[apply(Review, 1, function(x) { + length(which(x == "")) + }) == 0, ] + F1 <- matrix(unlist(F1), ncol = k + 3) + F1 <- data.frame(F1) + colnames(F1) <- colnames(Review) + F2 <- Review[apply(Review, 1, function(x) { + length(which(x == "")) + }) >= 1, ] + F2 <- matrix(unlist(F2), ncol = k + 3) + F2 <- data.frame(F2) + colnames(F2) <- colnames(F1) + f1 <- stats::aggregate(F1[, c(1, ncol(F1) - 1)], by = F1[, c(1, ncol(F1) - + 1)], length)[1:(ncol(F1[, c(1, ncol(F1) - 1)]) + 1)][which(stats::aggregate(F1[, + c(1, ncol(F1) - 1)], by = F1[, c(1, ncol(F1) - 1)], length)[1:(ncol(F1[, + c(1, ncol(F1) - 1)]) + 1)][, 3] >= 2), 1:2] + redundancyF1 <- rep(0, nrow(F1)) + redundancyF1[which(apply(F1[, c(1, ncol(F1) - 1)], 1, paste, collapse = "") %in% + apply(f1, 1, paste, collapse = ""))] <- 1 + correspondenceAB <- data.frame(cbind(rbind(F1, F2), c(redundancyF1, + rep(0, nrow(F2))))) + + # Creation of the unmatched flag for the correspondence table + # A:B. + correspondenceAB <- data.frame(correspondenceAB, 1) + colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, + function(x) { + colnames(x)[1] + }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Review", "Redundancy", + "Unmatched") + + } else if (Reference == "none") { + idx <- k + 4 + + # Creation of the redundancy flag for the correspondence table + # A:B. 
+ F1 <- data.frame(F[apply(F, 1, function(x) { + length(which(x == "")) + }) == 0, ]) + F1 <- matrix(unlist(F1), ncol = k + 2) + F1 <- data.frame(F1) + F2 <- data.frame(F[apply(F, 1, function(x) { + length(which(x == "")) + }) >= 1, ]) + F2 <- matrix(unlist(F2), ncol = k + 2) + F2 <- data.frame(F2) + colnames(F2) <- colnames(F1) + f1 <- stats::aggregate(F1[, c(1, ncol(F1))], by = F1[, c(1, ncol(F1))], + length)[1:(ncol(F1[, c(1, ncol(F1))]) + 1)][which(stats::aggregate(F1[, + c(1, ncol(F1))], by = F1[, c(1, ncol(F1))], length)[1:(ncol(F1[, + c(1, ncol(F1))]) + 1)][, 3] >= 2), 1:2] + redundancyF1 <- rep(0, nrow(F1)) + redundancyF1[which(apply(F1[, c(1, ncol(F1))], 1, paste, collapse = "") %in% + apply(f1, 1, paste, collapse = ""))] <- 1 + correspondenceAB <- data.frame(cbind(rbind(F1, F2), c(redundancyF1, + rep(0, nrow(F2))))) + + # Creation of the unmatched flag for the correspondence table + # A:B. + correspondenceAB <- data.frame(correspondenceAB, 1) + colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, + function(x) { + colnames(x)[1] + }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Redundancy", + "Unmatched") + + } + } else { + if (Reference %in% c("A", "B")) { + Review <- rep(0, nrow(F)) + Redundancy <- rep(0, nrow(F)) + Unmatched <- rep(1, nrow(F)) + correspondenceAB <- data.frame(cbind(F, Review, Redundancy, Unmatched)) + colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, + function(x) { + colnames(x)[1] + }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Review", "Redundancy", + "Unmatched") + } + if (Reference == "none") { + Redundancy <- rep(0, nrow(F)) + Unmatched <- rep(1, nrow(F)) + correspondenceAB <- data.frame(cbind(F, Redundancy, Unmatched)) + colnames(correspondenceAB) <- c(paste(colnames(RRR[[1]][1])), paste(unlist(lapply(RRR, + function(x) { + colnames(x)[1] + }))[seq(k) + 1]), paste(colnames(RRR[[k + 2]][1])), "Redundancy", + "Unmatched") + } + } + + + # The final 
Unmatched and the NoMatchFrom flags are created + NoMatchFromA <- rep("", nrow(correspondenceAB)) + NoMatchFromB <- rep("", nrow(correspondenceAB)) + correspondenceAB <- cbind(correspondenceAB, NoMatchFromA, NoMatchFromB) + + inA <- which(is.na(match(unlist(RRR[[1]][, 1]), correspondenceAB[, 1])) == TRUE) + if (length(inA) >= 1) { + InA <- cbind(matrix(RRR[[1]][inA, 1], length(inA), 1), matrix("", length(inA), + idx - 1)) + InA <- cbind(InA, matrix("", length(inA), 2)) + InA <- data.frame(InA) + colnames(InA) <- colnames(correspondenceAB) + correspondenceAB <- rbind(correspondenceAB, InA) + } + + inB <- which(is.na(match(unlist(RRR[[nrow(x)]][, 1]), correspondenceAB[, k + 2])) == + TRUE) + if (length(inB) >= 1) { + InB <- cbind(matrix("", length(inB), k + 1), matrix(RRR[[nrow(x)]][inB, + 1], length(inB), 1), matrix("", length(inB), idx - k - 2)) + InB <- cbind(InB, matrix("", length(inB), 2)) + InB <- data.frame(InB) + colnames(InB) <- colnames(correspondenceAB) + correspondenceAB <- rbind(correspondenceAB, InB) + } + + yesA <- which(!is.na(match(correspondenceAB[, 1], unlist(RRR[[1]][, 1]))) == TRUE) + yesAC1 <- which(!is.na(match(correspondenceAB[, 1], unlist(RRR[[nrow(x) + 1]][, + 1]))) == TRUE) + noAC1 <- which(is.na(match(correspondenceAB[, 1], unlist(RRR[[nrow(x) + 1]][, 1]))) == + TRUE) + + correspondenceAB$NoMatchFromA[intersect(yesA, yesAC1)] <- 0 + correspondenceAB$NoMatchFromA[intersect(yesA, noAC1)] <- 1 + + yesB <- which(!is.na(match(correspondenceAB[, k + 2], unlist(RRR[[nrow(x)]][, 1]))) == + TRUE) + yesBCk <- which(!is.na(match(correspondenceAB[, k + 2], unlist(RRR[[length(RRR)]][, + 1]))) == TRUE) + noBCk <- which(is.na(match(correspondenceAB[, k + 2], unlist(RRR[[length(RRR)]][, + 1]))) == TRUE) + + correspondenceAB$NoMatchFromB[intersect(yesB, yesBCk)] <- 0 + correspondenceAB$NoMatchFromB[intersect(yesB, noBCk)] <- 1 + + yesFinalA <- which(correspondenceAB[, 1] != "") + yesFinalB <- which(correspondenceAB[, k + 2] != "") + 
correspondenceAB$Unmatched <- 1 + correspondenceAB$Unmatched[intersect(yesFinalA, yesFinalB)] <- 0 + + if ((Reference %in% c("A", "B"))) { + correspondenceAB$Review[which(correspondenceAB[, 1] == "")] <- "" + correspondenceAB$Review[which(correspondenceAB[, k + 2] == "")] <- "" + } + + # Final redundancy flag + correspondenceAB$Redundancy <- 0 + f1 <- stats::aggregate(correspondenceAB[, c(1, k + 2)], by = correspondenceAB[, + c(1, k + 2)], length)[1:(ncol(correspondenceAB[, c(1, k + 2)]) + 1)][which(stats::aggregate(correspondenceAB[, + c(1, k + 2)], by = correspondenceAB[, c(1, k + 2)], length)[1:(ncol(correspondenceAB[, + c(1, k + 2)]) + 1)][, 3] >= 2), 1:2] + correspondenceAB$Redundancy[which(apply(correspondenceAB[, c(1, k + 2)], + 1, paste, collapse = " ") %in% apply(f1, 1, paste, collapse = " "))] <- 1 + + }, error = function(e) { + stop(simpleError(paste("An error has occurred and execution needs to stop. Please check the input data. \n Details line 1734:\n",e))) + }) + + message("\n") + + # Check the number of the unmatched codes. + if (length(which(as.vector(correspondenceAB$Unmatched) == 1))/nrow(correspondenceAB) < + MismatchTolerance) { + + tryCatch({ + + # The following if statement is applied if there are any + # supplementary information for the classification A, in order to + # be adjusted next to the correspondence table A:B. + if (ncol(RRR[[1]]) >= 2) { + A1 <- RRR[[1]][match(correspondenceAB[, 1], unlist(RRR[[1]][, 1])), + 2:ncol(RRR[[1]])] + A1[is.na(A1)] <- "" + A1 <- matrix(unlist(A1), ncol = length(2:ncol(RRR[[1]]))) + colnames(A1) <- paste(paste(colnames(RRR[[1]])[1]), colnames(RRR[[1]])[2:ncol(RRR[[1]])], + sep = "_") + correspondenceAB <- cbind(correspondenceAB, A1) + } + + # The following for loop is applied for the classfications C1, C2, + # ..., Ck. 
+ for (i1 in c(2:(((length(RRR) + 1)/2) - 1))) { + + # The if statement is applied if there are any supplementary + # information for the classfications C1, C2, ..., Ck, in order + # to be adjusted next to the correspondence table A:B. + if (ncol(RRR[[i1]]) >= 2) { + A1 <- RRR[[i1]][match(correspondenceAB[, i1], unlist(RRR[[i1]][, + 1])), 2:ncol(RRR[[i1]])] + A1[is.na(A1)] <- "" + A1 <- matrix(unlist(A1), ncol = length(2:ncol(RRR[[i1]]))) + colnames(A1) <- paste(paste(colnames(RRR[[i1]])[1]), colnames(RRR[[i1]])[2:ncol(RRR[[i1]])], + sep = "_") + correspondenceAB <- cbind(correspondenceAB, A1) + } + + } + + # The following if statement is applied if there are any + # supplementary information for the classification B, in order to + # be adjusted next to the correspondence table A:B. + if (ncol(RRR[[(length(RRR) + 1)/2]]) >= 2) { + A1 <- RRR[[(length(RRR) + 1)/2]][match(correspondenceAB[, (length(RRR) + + 1)/2], unlist(RRR[[(length(RRR) + 1)/2]][, 1])), 2:ncol(RRR[[(length(RRR) + + 1)/2]])] + A1[is.na(A1)] <- "" + A1 <- matrix(unlist(A1), ncol = length(2:ncol(RRR[[(length(RRR) + + 1)/2]]))) + colnames(A1) <- paste(paste(colnames(RRR[[k + 2]])[1]), colnames(RRR[[(length(RRR) + + 1)/2]])[2:ncol(RRR[[(length(RRR) + 1)/2]])], sep = "_") + correspondenceAB <- cbind(correspondenceAB, A1) + } + + # Find which .csv files are the correspondence tables. + Tail <- utils::tail(c(1:length(RRR)), (length(RRR) - 1)/2) + + # The following if statement is applied if there are any + # supplementary information for the correspondence table A:C1, in + # order to be adjusted next to the correspondence table A:B. 
+ if (ncol(RRR[[Tail[1]]]) >= 3) { + A1 <- RRR[[Tail[1]]][match(data.frame(t(correspondenceAB[, 1:2])), + data.frame(t(RRR[[Tail[1]]][, 1:2]))), 3:ncol(RRR[[Tail[1]]])] + A1[is.na(A1)] <- "" + A1 <- matrix(unlist(A1), ncol = length(3:ncol(RRR[[Tail[1]]]))) + colnames(A1) <- paste(paste(colnames(RRR[[Tail[1]]])[1]), colnames(RRR[[Tail[1]]])[3:ncol(RRR[[Tail[1]]])], + sep = "_") + correspondenceAB <- cbind(correspondenceAB, A1) + } + + # The following if statement is applied if there are any + # supplementary information for the correspondence tables (C1:C2 - + # C2:C3), (C2:C3 - C3:C4), ..., (C(k-2):C(k-1) - C(k-1):Ck), in + # order to be adjusted next to the correspondence table A:B. + if (length(Tail) >= 3) { + for (i2 in 2:(length(Tail) - 1)) { + if (ncol(RRR[[Tail[i2]]]) >= 3) { + A1 <- RRR[[Tail[i2]]][match(data.frame(t(correspondenceAB[, c(i2, + i2 + 1)])), data.frame(t(RRR[[Tail[i2]]][, 1:2]))), 3:ncol(RRR[[Tail[i2]]])] + A1[is.na(A1)] <- "" + A1 <- matrix(unlist(A1), ncol = length(3:ncol(RRR[[Tail[i2]]]))) + colnames(A1) <- paste(paste(colnames(RRR[[Tail[i2]]])[1]), colnames(RRR[[Tail[i2]]])[3:ncol(RRR[[Tail[i2]]])], + sep = "_") + correspondenceAB <- cbind(correspondenceAB, A1) + } + } + } + + # The following if statement is applied if there are any + # supplementary information for the correspondence table B:Ck, in + # order to be adjusted next to the correspondence table A:B. 
+ if (ncol(RRR[[Tail[length(Tail)]]]) >= 3) { + A1 <- RRR[[Tail[length(Tail)]]][match(data.frame(t(correspondenceAB[, + c(((length(RRR) + 1)/2) - 1, (length(RRR) + 1)/2)])), data.frame(t(RRR[[Tail[length(Tail)]]][, + c(2, 1)]))), 3:ncol(RRR[[Tail[length(Tail)]]])] + A1[is.na(A1)] <- "" + A1 <- matrix(unlist(A1), ncol = length(3:ncol(RRR[[Tail[length(Tail)]]]))) + colnames(A1) <- paste(paste(colnames(RRR[[Tail[length(Tail)]]])[1]), + colnames(RRR[[Tail[length(Tail)]]])[3:ncol(RRR[[Tail[length(Tail)]]])], + sep = "_") + correspondenceAB <- cbind(correspondenceAB, A1) + } + }, error = function(e) { + stop(simpleError(paste("An error has occurred and execution needs to stop. Please check the input data. \n Details line 1841: \n",e))) + }) + + } else { + # Error message in case the percentage of unmatched codes between A and + # B is larger than the desired threshold. + stop("Too many codes in either of classifications A and B cannot be mapped to any code in the other one.\n", + round(length(which(as.vector(correspondenceAB$Unmatched) == 1))/nrow(correspondenceAB)*100,2),"% is unmatched which exceeds the mismatch tolerance of ", MismatchTolerance) + } + + + tryCatch({ + + # The final correspondence table A:B is sorted, firstly, based on + # classification A, and then, based on classification B. 
+ correspondenceAB <- correspondenceAB[order(correspondenceAB[, 1], correspondenceAB[, + (length(RRR) + 1)/2], decreasing = FALSE), ] + + + # Redundancy_trim parameter (MP) + # Find the columns which are related to linking datasets which values need to be recorded as "Multiple" + ## 2 + n_link_data*2 + 1 = n_data + num_link = (length(test.names) - 3)/2 + col_multiple = numeric(0) + for (nl in 1:num_link){ + col_multiple = unique(c(col_multiple, grep(colnames(correspondenceAB)[1 + nl], colnames(correspondenceAB), value = T))) + } + max_col = num_link + 2 + + #Do redundancy trim only if redundancy is there (MP) + if (length(which(correspondenceAB$Redundancy == 1)) != 0){ + + # Find unique combination of A and B and identify them with a number + uniqueAB = unique(correspondenceAB[which(correspondenceAB$Redundancy == 1),c(1, max_col)]) + uniqueAB$id_to_use = 1:nrow(uniqueAB) + + correspondenceAB = merge(correspondenceAB, uniqueAB, by = colnames(correspondenceAB)[c(1,max_col)], all.x = TRUE)[, union(names(correspondenceAB), names(uniqueAB))] + col_link = grep("id_to_use", colnames(correspondenceAB), value = T) + + ### new but probably slower + if (Redundancy_trim == TRUE){ + x_temp = split(correspondenceAB[which(correspondenceAB$Redundancy == 1), col_multiple], correspondenceAB[which(correspondenceAB$Redundancy == 1), col_link]) + + for (i in 1:nrow(uniqueAB)){ + multiple_values = apply(x_temp[[i]], 2, function(x) length(unique(x))) + x_change = which(multiple_values != 1) + #replace with multiple + col_multiple_temp = c(col_multiple)[x_change] + correspondenceAB[which(correspondenceAB$Redundancy == 1 & correspondenceAB[,col_link] == i), unique(col_multiple_temp)] = "Multiple" + } + + correspondenceAB = correspondenceAB[,!names(correspondenceAB) %in% "id_to_use"] + + ### old but probably faster + ##replace with multiple + ##replace with multiple + #correspondenceAB[which(correspondenceAB$Redundancy == 1), unique(col_multiple)] = "Multiple" + + #eliminate duplicates + 
dup = as.numeric(duplicated(correspondenceAB[,1:max_col])) + correspondenceAB = correspondenceAB[which(dup == 0), ] + + } + } else { + + correspondenceAB = correspondenceAB + } + + if (Redundancy_trim==FALSE){ + #add a redundancy keep flag to indicate which row will be kept + dup = as.numeric(duplicated(correspondenceAB[,c(1,max_col)])) + red_col = which(colnames(correspondenceAB) == "Redundancy") + correspondenceAB$Redundancy_keep = rep(0, nrow(correspondenceAB)) + correspondenceAB$Redundancy_keep[which(dup == "0" & correspondenceAB$Redundancy == "1")] = 1 + correspondenceAB = correspondenceAB[,c(1:red_col, ncol(correspondenceAB), (red_col+1):(ncol(correspondenceAB)-1))] + correspondenceAB = correspondenceAB[,!names(correspondenceAB) %in% "id_to_use"] + } + + + # Create a data frame that contains the names of the classifications. + CsvNames <- data.frame(matrix(0, k + 2, 1)) + + CsvNames[1, 1] <- paste("A:", colnames(correspondenceAB)[1], sep = " ") + + CsvNames[k + 2, 1] <- paste("B:", colnames(correspondenceAB)[k + 2], sep = " ") + + for (i3 in seq(k) + 1) { + CsvNames[i3, 1] <- paste(paste("C", i3 - 1, ":", sep = ""), colnames(correspondenceAB)[i3], + sep = " ") + } + + CsvNames <- data.frame(CsvNames) + + ##Added condition when CSVout is null (MP) + if (!is.null(CSVout)) { + + pos <- regexpr("\\/[^\\/]*$", CSVout) + Name1 <- substr(CSVout, 1, pos[[1]]) + Name2 <- substr(CSVout, pos[[1]] + 1, nchar(CSVout)) + + pos <- regexpr("\\.[^\\.]*$", Name2) + if (pos[[1]] == -1) { + Name <- substr(Name2, pos[[1]] + 1, nchar(Name2)) + } else { + Name <- substr(Name2, 1, pos[[1]] - 1) + } + } + + colnames(CsvNames) <- paste("Classification:", "Name", sep = " ") + + # Create a data frame that contains the final correspondence table + # (final desired table). 
+ Final <- apply(correspondenceAB, 2, function(x) { + gsub(" ", " ", x) + }) + + + if (is.null(dim(Final))) { + Final <- t(data.frame(Final)) + rownames(Final) <- 1 + } + + + }, error = function(e) { + stop(simpleError(paste("An error has occurred and execution needs to stop. Please check the input data. \n Deatils line 1895:\n",e))) + }) + + # Check so as to write (or not) the final correspondence table (final + # desired table) as well as the names of classifications in two seperate + # csv files. + tryCatch({ + + if (!is.null(CSVout)) { + data.table::fwrite(data.frame(Final, check.names = FALSE), file = CSVout, quote = TRUE) + utils::write.csv(CsvNames, file = paste0(Name1, "classificationNames_", Name2), + row.names = FALSE) + } + + }, error = function(e) { + stop(simpleError("An error occurred while trying to write the output to the specified files. Please check the respective input parameters.")) + }) + + # The final list that contains the final correspondence table (final + # desired table) as a data frame as well as the names of classifications as + # a data frame. + tryCatch({ + + FinalResults <- list() + FinalResults[[1]] <- data.frame(Final, check.names = FALSE, row.names = NULL) + FinalResults[[2]] <- CsvNames + names(FinalResults) <- c("newCorrespondenceTable", "classificationNames") + + # newCorrespondenceTable function returns the final correspondence + # table A:B, that contains the pivot classifications C1, C2, ..., Ck, + # as well as any supplementary information about the classification + # tables A, C1, C2, ..., Ck, B, and the correspondence tables A:C1, + # (C1:C2 - C2:C3), (C2:C3 - C3:C4), ..., (C(k-2):C(k-1) - C(k-1):Ck), + # B:Ck. + + return(FinalResults) + }, error = function(e) { + stop(simpleError(paste("An error has occurred and execution needs to stop. Please check the input data. 
\n Details line 1946:\n",e))) + }) + +} + diff --git a/R/retrieveClassificationTable.R b/R/retrieveClassificationTable.R index 084c85a..1b39123 100644 --- a/R/retrieveClassificationTable.R +++ b/R/retrieveClassificationTable.R @@ -1,175 +1,196 @@ -#' @title Retrieve classification tables from CELLAR and FAO repositories. -#' @description To facilitate the utilization of European classifications as inputs for the newCorrespondenceTable and updateCorrespondenceTable functions, -#' "retrieveClassificationTable()" utility function has been developed. This utility function leverage R packages that enable SPARQL queries. -#' @param prefix Prefixes are typically defined at the beginning of a SPARQL query and are used throughout the query to make it more concise and easier to read. -#' Multiple prefixes can be defined in a single query to cover different namespaces used in the data set. -#' The function 'classificationEndpoint()' can be used to generate the prefixes for the selected classification table. -#' @param endpoint SPARQL endpoints provide a standardized way to access data sets, -#' making it easier to retrieve specific information or perform complex queries on linked data. -#' The valid values are \code{"CELLAR"} or \code{"FAO"}. -#' @param conceptScheme Refers to a unique identifier associated to specific classification table. -#' The conceptScheme can be obtained by utilizing the "classificationEndpoint()" function. -#' @param level Refers to the hierarchical levels of the selected classification table. -#' The detailed level information can be obtained by utilizing the "structureData() " function. -#' By default is set to \code{"ALL"}. This is an optional argument. -#' @param language Refers to the specific language used for providing label, include and exclude information in the selected classification table. -#' By default is set to "en". This is an optional argument. -#' @param CSVout The valid values are \code{FALSE} or \code{TRUE}. 
In both cases the classification table as an R object. -#' If output should be saved as a csv file, the argument should be set as \code{TRUE}. By default, no csv file is produced. -#' @param showQuery The valid values are \code{FALSE} or \code{TRUE}. In both cases the classification table as an R object. -#' If needed to view the SPARQL query used, the argument should be set as \code{TRUE}. By default, no SPARQL query is produced. -#' @import httr -#' @export -#' @return -#' \code{retrieveClassificationTable()} returns a classification tables from CELLAR and FAO. The table includes the following variables: -#' \itemize{ -#' \item Classification name (e.g. nace2): the code of each object -#' \item NAME: the corresponding name of each object -#' \item Include: details on each object -#' \item Include_Also: details on each object -#' \item Exclude: details on each object -#' \item URL: the URL from which the SPARQL query was retrieved -#' } -#' @examples -#' { -#' endpoint = "CELLAR" -#' prefix = "nace2" -#' conceptScheme = "nace2" -#' -#' results_ls = retrieveClassificationTable(prefix, endpoint, conceptScheme) -#' -#' # View SPARQL Query -#' cat(results_ls[[1]]) -#' -#' #View Classification Table -#' #View(results_ls[[2]]) -#' } - - - -retrieveClassificationTable = function(prefix, endpoint, conceptScheme, level = "ALL", language = "en", CSVout = FALSE, showQuery=TRUE) { - - ### Define endpoint - if (endpoint == "CELLAR") { - source = "http://publications.europa.eu/webapi/rdf/sparql" - } - if (endpoint == "FAO") { - source = "https://stats.fao.org/caliper/sparql/AllVocs" - } - - ### Load prefixes using prefixList function - prefixlist = prefixList(endpoint, desired_prefix = prefix) - prefixlist = as.character(paste(prefixlist, collapse = "\n")) - - - # # Check if classification has level, if not, set level = "ALL" - dt_level = suppressMessages(dataStructure(prefix, conceptScheme, endpoint, language)) - - if (nrow(dt_level) == 0 & level != "ALL") { - level = "ALL" - 
message("Classification has no levels, so level = ALL was set to retrieve the table.") - } - - ### Define SPARQL query -- BASE: all levels - SPARQL.query_0 = paste0(prefixlist, " - SELECT DISTINCT ?", prefix, " ?NAME ?Parent ?Include ?Include_Also ?Exclude ?URL - - WHERE { - ?s skos:altLabel ?Label ; - skos:inScheme ?Scheme ; - ^skos:member ?Member ; - #skos:broader ?Broader;. - # skos:altLabel ?Label ; - skos:notation ?notation . - OPTIONAL {?s skos:broader ?Broader. - ?Broader skos:notation ?BT_Notation.} - #FILTER (datatype(?notation) = xsd:string) - BIND (STR(?BT_Notation) as ?Parent) - FILTER (?Scheme = ", prefix, ":", conceptScheme, ") - FILTER (lang(?Label) = '", language, "') - - BIND (STR(?s) AS ?URL) - BIND (STR(?notation) as ?", prefix, " ) - BIND (STR(?Label) as ?NAME) - # BIND (datatype(?notation) AS ?datatype) - - OPTIONAL {?s skos:scopeNote ?Include . FILTER (LANG(?Include) = '", language, "') .} - OPTIONAL {?s xkos:exclusionNote ?Exclude . FILTER (LANG(?Exclude) = '", language, "').} - OPTIONAL {?s xkos:additionalContentNote ?Include_Also . 
FILTER (LANG(?Include_Also) = '", language, "').} - - ") - - - - - - ### Define SPARQL query -- FILTER LEVEL - SPARQL.query_level = paste0("FILTER (?Member = ", prefix, ":", "division", ")") - - ### End SPARQL query ", prefix - SPARQL.query_end = paste0("} - ORDER BY ?", prefix - ) - - if (length(level) == 0 ){ - stop("Classification level was not specified.") - } else { - if (level == "ALL") { - SPARQL.query = paste0(SPARQL.query_0, SPARQL.query_end) - } else { - SPARQL.query = paste0(SPARQL.query_0, SPARQL.query_level, SPARQL.query_end) - } - - } - response = httr::POST(url = source, accept("text/csv"), body = list(query = SPARQL.query), encode = "form") - data = data.frame(content(response, show_col_types = FALSE)) - - #keep only plainLiteral if more than one datatype // - #FAO - "http://www.w3.org/2001/XMLSchema#string" - #CELLAR - "http://www.w3.org/2001/XMLSchema#string" - "http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral" - type = unique(data$datatype) - if (length(type) > 1){ - data = data[which(data$datatype == "http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral"), ] - } - - #remove datatype col - data = data[, 1:(ncol(data)-1)] - - #are there other duplicates? 
URL is the same and the other changes - xcol = which(colnames(data) == "URL") - dup = length(which(duplicated(data[,-xcol]) == TRUE)) - - if (dup > 0) { - warning("There are duplicates codes in the classification table.") - } - - #Get the good format before we got - data <- lapply(data, function(x) gsub("\n", " ", x)) - data <- as.data.frame(data) - - # Save results as CSV and show where it was stored - if (CSVout == TRUE) { - name_csv = paste0(prefix, "_table.csv") - write.csv(data, file= name_csv, row.names=FALSE) - message(paste0("The table was saved in ", getwd(), name_csv)) - } else if (is.character(CSVout)) { - # if user provide a csv file - write.csv(data, file = CSVout, row.names = FALSE) - message(paste0("The table was saved in ", getwd(), CSVout)) - } - - if (showQuery) { - result=list() - result[[1]]=SPARQL.query - result[[2]]=data - - names(result)=c("SPARQL.query", "ClassificationTable") - cat(result$SPARQL.query, sep ="/n") - } - - if (showQuery==FALSE){ - result=data - } - - return(result) -} +#' @title Retrieve classification tables from CELLAR and FAO repositories. +#' @description To facilitate the utilization of European classifications as inputs for the newCorrespondenceTable and updateCorrespondenceTable functions, +#' "retrieveClassificationTable()" utility function has been developed. This utility function leverage R packages that enable SPARQL queries. +#' @param prefix Prefixes are typically defined at the beginning of a SPARQL query and are used throughout the query to make it more concise and easier to read. +#' Multiple prefixes can be defined in a single query to cover different namespaces used in the data set. +#' The function 'classificationEndpoint()' can be used to generate the prefixes for the selected classification table. +#' @param endpoint SPARQL endpoints provide a standardized way to access data sets, +#' making it easier to retrieve specific information or perform complex queries on linked data. 
+#' The valid values are \code{"CELLAR"} or \code{"FAO"}.
+#' @param conceptScheme Refers to a unique identifier associated with a specific classification table.
+#' The conceptScheme can be obtained by utilizing the "classificationEndpoint()" function.
+#' @param level Refers to the hierarchical levels of the selected classification table.
+#' The detailed level information can be obtained by utilizing the "structureData()" function.
+#' By default is set to \code{"ALL"}. This is an optional argument.
+#' @param language Refers to the specific language used for providing label, include and exclude information in the selected classification table.
+#' By default is set to "en". This is an optional argument.
+#' @param CSVout A valid path to a csv file, including file name and extension, where the output should be saved.
+#' By default it is set to \code{NULL} and no csv file is produced.
+#' @param showQuery The valid values are \code{FALSE} or \code{TRUE}. In both cases the classification table is returned as an R object.
+#' If needed to view the SPARQL query used, the argument should be set as \code{TRUE}. By default, no SPARQL query is produced.
+#' @param localData This parameter allows the user to retrieve static data shipped with the package, in order to avoid any issues with the API.
+#' @import httr
+#' @export
+#' @return
+#' \code{retrieveClassificationTable()} returns a classification table from CELLAR and FAO. The table includes the following variables:
+#' \itemize{
+#' \item Classification name (e.g.
nace2): the code of each object +#' \item NAME: the corresponding name of each object +#' \item Include: details on each object +#' \item Include_Also: details on each object +#' \item Exclude: details on each object +#' \item URL: the URL from which the SPARQL query was retrieved +#' } +#' @examples +#' { +#' endpoint = "CELLAR" +#' prefix = "nace2" +#' conceptScheme = "nace2" +#' +#' results_ls = retrieveClassificationTable(prefix, endpoint, conceptScheme) +#' +#' # View SPARQL Query +#' cat(results_ls[[1]]) +#' +#' #View Classification Table +#' #View(results_ls[[2]]) +#' } +#' + + +retrieveClassificationTable = function(prefix, endpoint, conceptScheme, level = "ALL", language = "en", CSVout = NULL, showQuery = TRUE) { + # Check the useLocalDataForVignettes option + if (getOption("useLocalDataForVignettes", FALSE)) { + localDataPath <- system.file("extdata", paste0(prefix, "_", language, ".csv"), package = "correspondenceTables") + + if (file.exists(localDataPath)) { + # Read data from the local file if it exists + data <- read.csv(localDataPath) + if (showQuery) { + print("Data loaded from local file.") + } + return(data) + } + } else { + ### Define endpoint + if (endpoint == "CELLAR") { + source = "http://publications.europa.eu/webapi/rdf/sparql" + } + if (endpoint == "FAO") { + source = "https://stats.fao.org/caliper/sparql/AllVocs" + } + + ### Load prefixes using prefixList function + prefixlist = prefixList(endpoint, desired_prefix = prefix) + prefixlist = as.character(paste(prefixlist, collapse = "\n")) + + + # # Check if classification has level, if not, set level = "ALL" + dt_level = suppressMessages(dataStructure(prefix, conceptScheme, endpoint, language)) + + if (nrow(dt_level) == 0 & level != "ALL") { + level = "ALL" + message("Classification has no levels, so level = ALL was set to retrieve the table.") + } + + ### Define SPARQL query -- BASE: all levels + SPARQL.query_0 = paste0(prefixlist, " + SELECT DISTINCT ?", prefix, " ?NAME ?Parent ?Include 
?Include_Also ?Exclude ?URL + + WHERE { + ?s skos:altLabel ?Label ; + skos:inScheme ?Scheme ; + ^skos:member ?Member ; + #skos:broader ?Broader;. + # skos:altLabel ?Label ; + skos:notation ?notation . + OPTIONAL {?s skos:broader ?Broader. + ?Broader skos:notation ?BT_Notation.} + #FILTER (datatype(?notation) = xsd:string) + BIND (STR(?BT_Notation) as ?Parent) + FILTER (?Scheme = ", prefix, ":", conceptScheme, ") + FILTER (lang(?Label) = '", language, "') + + BIND (STR(?s) AS ?URL) + BIND (STR(?notation) as ?", prefix, " ) + BIND (STR(?Label) as ?NAME) + # BIND (datatype(?notation) AS ?datatype) + + OPTIONAL {?s skos:scopeNote ?Include . FILTER (LANG(?Include) = '", language, "') .} + OPTIONAL {?s xkos:exclusionNote ?Exclude . FILTER (LANG(?Exclude) = '", language, "').} + OPTIONAL {?s xkos:additionalContentNote ?Include_Also . FILTER (LANG(?Include_Also) = '", language, "').} + + ") + + + + + + ### Define SPARQL query -- FILTER LEVEL + SPARQL.query_level = paste0("FILTER (?Member = ", prefix, ":", "division", ")") + + ### End SPARQL query ", prefix + SPARQL.query_end = paste0("} + ORDER BY ?", prefix + ) + + if (length(level) == 0 ){ + stop("Classification level was not specified.") + } else { + if (level == "ALL") { + SPARQL.query = paste0(SPARQL.query_0, SPARQL.query_end) + } else { + SPARQL.query = paste0(SPARQL.query_0, SPARQL.query_level, SPARQL.query_end) + } + + } + tryCatch({ + response = httr::POST(url = source, accept("text/csv"), body = list(query = SPARQL.query), encode = "form") + data = data.frame(content(response, show_col_types = FALSE)) + }, error = function(e) { + stop(message("Error occurred during SPARQL query execution: ", e$message)) + return(NULL) + }) + + #keep only plainLiteral if more than one datatype // + #FAO - "http://www.w3.org/2001/XMLSchema#string" + #CELLAR - "http://www.w3.org/2001/XMLSchema#string" - "http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral" + type = unique(data$datatype) + if (length(type) > 1){ + data = 
data[which(data$datatype == "http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral"), ] + } + + #remove datatype col + data = data[, 1:(ncol(data)-1)] + + #are there other duplicates? URL is the same and the other changes + xcol = which(colnames(data) == "URL") + dup = length(which(duplicated(data[,-xcol]) == TRUE)) + + if (dup > 0) { + warning("There are duplicates codes in the classification table.") + } + + #Get the good format before we got + data <- lapply(data, function(x) gsub("\n", " ", x)) + data <- as.data.frame(data) + + # Save results as CSV and show where it was stored + # if (CSVout == TRUE) { + # name_csv = paste0(prefix,"_", language, ".csv") + # write.csv(data, file= name_csv, row.names=FALSE) + # message(paste0("The table was saved in ", getwd(), name_csv)) + # } else if (is.character(CSVout)) { + # # if user provide a csv file + # write.csv(data, file = CSVout, row.names = FALSE) + # message(paste0("The table was saved in ", getwd(), CSVout)) + # } + CsvFileSave(CSVout, data ) + + if (showQuery) { + result=list() + result[[1]]=SPARQL.query + result[[2]]=data + + names(result)=c("SPARQL.query", "ClassificationTable") + cat(result$SPARQL.query, sep ="/n") + } + + if (showQuery==FALSE){ + result=data + } + + return(result) + } +} + diff --git a/R/retrieveCorrespondenceTable.R b/R/retrieveCorrespondenceTable.R new file mode 100644 index 0000000..e1a4a23 --- /dev/null +++ b/R/retrieveCorrespondenceTable.R @@ -0,0 +1,170 @@ +#' @title Retrieve correspondence tables between statistical classifications from CELLAR and FAO repositories. +#' @description To facilitate the utilization of correspondence tables as inputs for the newCorrespondenceTable and updateCorrespondenceTable functions, +#' "retrieveCorrespondenceTable" utility function has been developed. This utility function leverage R packages that enable SPARQL queries. 
+#' @param prefix Prefixes are typically defined at the beginning of a SPARQL query +#' and are used throughout the query to make it more concise and easier to read. +#' Multiple prefixes can be defined in a single query to cover different namespaces used in the dataset. +#' The function 'classificationEndpoint()' can be used to generate the prefixes for the selected correspondence table. +#' @param endpoint SPARQL endpoints provide a standardized way to access data sets, +#' making it easier to retrieve specific information or perform complex queries on linked data. +#' The valid values are \code{"CELLAR"} or \code{"FAO"}. +#' @param ID_table Refers to a unique identifier associated with a specific correspondence table. +#' The ID_table can be obtained by utilizing the "correspondenceList()" function. +#' @param language Refers to the specific language used for providing label, include and exclude information in the selected correspondence table. +#' By default is set to "en". This is an optional argument. +#' @param CSVout The valid value is a valid path to a csv file including file name and extension. By default, no csv file is produced, \code{NULL} +#' @param showQuery The valid values are \code{FALSE} or \code{TRUE}. In both cases the correspondence table as an R object. +#' If needed to view the SPARQL query used, the argument should be set as \code{TRUE}. By default, no SPARQL query is produced. +#' @param localData this parameter allow the user to retrieve static data from the package in order to avoid any issues from the api +#' @import httr +#' @export +#' @return +#' \code{retrieveCorrespondenceTable()} returns a classification tables from CELLAR and FAO. The table includes the following variables: +#' \itemize{ +#' \item Source Classification name (e.g. cn2019): the code of each object in the source classification +#' \item Source Classification name (e.g. 
cn2021): the code of each object in the target classification +#' \item Target Classification label: the corresponding label of each object (e.g. cn2019) +#' \item Include: include details on each object (e.g. cn2019) +#' \item Exclude: details on each object (e.g. cn2019) +#' \item Target Classification label: the corresponding label of each object (e.g. cn2021) +#' \item Include: include details on each object (e.g. cn2021) +#' \item Exclude: details on each object (e.g. cn2021) +#' \item Comment: details on each object, if available +#' \item URL: the URL from which the SPARQL query was retrieved +#' } +#' @examples +#' { +#' endpoint = "CELLAR" +#' prefix = "cn2022" +#' ID_table = "CN2022_NST2007" +#' +#' results_ls = retrieveCorrespondenceTable(prefix, endpoint, ID_table) +#' +#' # View SPARQL Query +#' cat(results_ls[[1]]) +#' +#' #View Classification Table +#' #View(results_ls[[2]]) +#' } + + +retrieveCorrespondenceTable = function(prefix, endpoint, ID_table, language = "en", CSVout = NULL, showQuery = TRUE) { + # Check the useLocalDataForVignettes option + if (getOption("useLocalDataForVignettes", FALSE)) { + localDataPath <- system.file("extdata", paste0(ID_table, "_", language, ".csv"), package = "correspondenceTables") + + if (file.exists(localDataPath)) { + # Read data from the local file if it exists + data <- read.csv(localDataPath) + if (showQuery) { + print("Data loaded from local file.") + } + return(data) + } + } else { + ### Define endpoint + if (endpoint == "CELLAR") { + source = "http://publications.europa.eu/webapi/rdf/sparql" + } + if (endpoint == "FAO") { + source = "https://stats.fao.org/caliper/sparql/AllVocs" + } + + ## Define A and B + ID_table_temp = gsub("-", "_", ID_table) + ID_table_temp = gsub("__", "_", ID_table_temp) + A = sub("_.*", "", ID_table_temp) + B = sub(".*_", "", ID_table_temp) + + ### Load prefixes using prefixList function + prefixlist = prefixList(endpoint, desired_prefix = tolower(c(A,B))) + prefixlist = 
as.character(paste(prefixlist, collapse = "\n")) + + ### CLASSIFICATION TABLE SPARQL QUERIES + ### Define SPARQL query -- BASE + SPARQL.query_0 = paste0(prefixlist, " + SELECT ?", A ," ?", B ," ?Label_", A ," ?Label_", B ," ?Include_", A ," ?Exclude_", A ," ?Include_", B ," ?Exclude_", B ," ?Comment ?URL ?Sourcedatatype ?Targetdatatype + + WHERE { + ", prefix, ":", ID_table, " xkos:madeOf ?Associations . + ?Associations xkos:sourceConcept ?Source . + OPTIONAL {?Associations xkos:targetConcept ?Target .} + OPTIONAL {?Associations rdfs:comment ?Comment . } + + ?Source skos:notation ?SourceNotation . + ?Target skos:notation ?TargetNotation . + + #FILTER ( datatype(?SourceNotation) = rdf:PlainLiteral) + #FILTER ( datatype(?TargetNotation) = rdf:PlainLiteral) + + BIND (STR(?Associations ) AS ?URL) + BIND (STR(?SourceNotation) as ?", A ,") + BIND (STR(?TargetNotation) as ?", B ,") + BIND (datatype(?SourceNotation) AS ?Sourcedatatype) + BIND (datatype(?TargetNotation) AS ?Targetdatatype) + + OPTIONAL { ?Source skos:altLabel ?Label_", A ," FILTER (LANG(?Label_", A ,") = '", language, "') .} + OPTIONAL { ?Target skos:altLabel ?Label_", B ," FILTER (LANG(?Label_", B ,") = '", language, "') .} + OPTIONAL {?Source skos:scopeNote ?Include_", A ,". FILTER (LANG(?Include_", A ,") = '", language, "') .} + OPTIONAL {?Source xkos:exclusionNote ?Exclude_", A ,". FILTER (LANG(?Exclude_", A ,") = '", language, "') .} + OPTIONAL {?Target skos:scopeNote ?Include_", B ,". FILTER (LANG(?Include_", B ,") = '", language, "') .} + OPTIONAL {?Target xkos:exclusionNote ?Exclude_", B ,". 
FILTER (LANG(?Exclude_", B ,") = '", language, "') .}
+
+                         ")
+
+  ### End SPARQL query ", prefix
+  SPARQL.query_end = paste0("}
+                            ORDER BY ?Source
+                            ")
+
+  SPARQL.query = paste0(SPARQL.query_0, SPARQL.query_end)
+
+  response = httr::POST(url = source, accept("text/csv"), body = list(query = SPARQL.query), encode = "form")
+  data = data.frame(content(response, show_col_types = FALSE))
+
+
+  #keep only plainLiteral if more than one datatype //
+  #FAO - "http://www.w3.org/2001/XMLSchema#string"
+  #CELLAR - "http://www.w3.org/2001/XMLSchema#string" - "http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral"
+  Source_type = unique(data$Sourcedatatype)
+  Target_type = unique(data$Targetdatatype)
+  if (length(Source_type) > 1 | length(Target_type) > 1){
+    data = data[which(data$Sourcedatatype == "http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral"), ]
+    data = data[which(data$Targetdatatype == "http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral"), ]
+  }
+
+  #remove datatype col
+  data = data[, 1:(ncol(data)-2)]
+
+  data <- lapply(data, function(x) gsub("\n", " ", x))
+  data <- as.data.frame(data)
+
+  # Save results as CSV and show where it was stored
+  # if (CSVout == TRUE) {
+  #   name_csv = paste0(ID_table,"_",language,"_table.csv")
+  #   write.csv(data, file= name_csv, row.names=FALSE)
+  #   message(paste0("The correspondence table was saved in ", getwd(), name_csv))
+  # } else if (is.character(CSVout)) {
+  #   # if user provide a csv file
+  #   write.csv(data, file = CSVout, row.names = FALSE)
+  #   message(paste0("The table was saved in ", getwd(), CSVout))
+  # }
+  CsvFileSave(CSVout, data )
+
+
+  }
+
+  if (showQuery) {
+    result=list()
+    result[[1]]=SPARQL.query
+    result[[2]]=data
+    names(result)=c("SPARQL.query", "CorrespondenceTable")
+    
cat(result$SPARQL.query, sep ="/n") + } + + if (showQuery == FALSE){ + result=data + } + + return(result) +} diff --git a/R/testCsvParameter.R b/R/testCsvParameter.R new file mode 100644 index 0000000..67d1c3b --- /dev/null +++ b/R/testCsvParameter.R @@ -0,0 +1,45 @@ +testCsvParameter <- function(arg_name, arg_value) { + caller <- sys.call(-1) + tryCatch({ + if (is.null(arg_value)) { + # message(paste("Warning in", as.character(caller[1]), ":", arg_name, "argument is NULL. No CSV file will be generated.")) + return() + } + + if (arg_value == FALSE) { + # message(paste("Warning in", as.character(caller[1]), ":", arg_name, "is FALSE. No CSV file will be generated.")) + return() + } + + if (arg_value == TRUE) { + message(paste("CSVout is TRUE. A CSV file will be generated.")) + return(invisible()) # Ne retourne rien + } + + if (arg_value == "") { + message(paste("Warning in", as.character(caller[1]), ":", arg_name, "is empty. No CSV file will be generated.")) + invisible(NULL) + } + + if (!is.character(arg_value)) { + stop(paste(arg_name, "argument must be a character string.")) + } + + # Check if it's a directory path + if (dir.exists(arg_value)) { + stop(paste("File path for", arg_name, "is a directory path missing the file name and extension")) + } + + + if (!grepl("\\.csv$", arg_value)) { + stop(paste("File", arg_name, "does not have the .csv extension.")) + } + + # if (!file.exists(arg_value)) { + # stop(paste("File path for", arg_name, "does not exist.")) + # } + + }, error = function(e) { + stop(message(paste("Error in", as.character(caller[1]), ":", e$message))) + }) +} diff --git a/R/testInputTable.R b/R/testInputTable.R new file mode 100644 index 0000000..0b1f5a0 --- /dev/null +++ b/R/testInputTable.R @@ -0,0 +1,54 @@ +testInputTable <- function(arg_name, arg_value) { + errors <- c() # Initialiser un vecteur pour collecter les messages d'erreur + caller <- sys.call(-1) #define the caller function + + if (missing(arg_value) || length(arg_value) == 0) { + 
stop(paste("Argument", arg_name, "is not defined or empty")) + } + tryCatch({ + if (is.character(arg_value)) { + if (!file.exists(arg_value)) { + errors <- c(errors, paste("Error in", as.character(caller[1]), ": The file path specified for", arg_name, "does not exist. Please provide a valid file path.")) + } else { + #message(paste("Verification:", arg_name, "file path exists.")) + + if (!grepl("\\.csv$", arg_value)) { + errors <- c(errors, paste("Error in", as.character(caller[1]), ": The file specified for", arg_name, "is not a .csv file. Please ensure the file has a .csv extension.")) + # } else { + # message(paste("Verification:", arg_name, "file format is correct (CSV).")) + } + } + + if (length(errors) == 0) { # Si aucune erreur, essayer de lire le fichier CSV + data <- tryCatch({ + read.csv2(arg_value, header = TRUE, sep = ",") + }, warning = function(w) { + errors <- c(errors, paste("Warning in", as.character(caller[1]), ": Failed to import data from", arg_name, ". The file might be corrupted or improperly formatted.")) + return(invisible()) + }) + + if (exists("data")) { + #message(paste("Success:", arg_name, "has been successfully loaded as a dataframe.")) + return(data) + } + } + + } else if (is.data.frame(arg_value)) { + # message(paste("Success:", arg_name, "is already a dataframe and is ready for immediate use.")) + return(arg_value) + + } else { + errors <- c(errors, paste("Error in", as.character(caller[1]), ":", arg_name, "must be either a valid .csv file path or a dataframe.")) + } + + }, error = function(e) { + errors <- c(errors, paste("Error in", as.character(caller[1]), ":", e$message)) + }) + + if (length(errors) > 0) { + cat("Errors encountered:\n", paste(errors, collapse = "\n"), "\n") + stop("Errors were encountered. 
Execution halted.") + } + + invisible(NULL) # Use invisible() to suppress the NULL return from being printed +} diff --git a/vignettes/Analyse_CorrespondenceTable.Rmd b/vignettes/Analyse_CorrespondenceTable.Rmd index 408cdd2..ec8965e 100644 --- a/vignettes/Analyse_CorrespondenceTable.Rmd +++ b/vignettes/Analyse_CorrespondenceTable.Rmd @@ -1,67 +1,66 @@ ---- -title: "AnalyseCorrespondenceTable" -description: "This vignette explains how to use the analyseCorrespondenceTable function." -vignette: > - %\VignetteIndexEntry{Analyse classification correspondence tables with the correspondenceTables package} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -## Description - -This vignette explains how to use the `analyseCorrespondenceTable` function to perform analysis on correspondence tables. - -### Package Installation - -To use the `analyseCorrespondenceTable` function, you first need to install and load the `correspondenceTables` package. You can use the following command: - -```{r include=FALSE} -knitr::opts_chunk$set(collapse = TRUE, comment = "#>") -``` - -```{r} -library(correspondenceTables) -``` - -```{r results='hide'} -# Perform analysis -result <- analyseCorrespondenceTable(AB =system.file("extdata", "ExempleAnnexe.csv", package = "correspondenceTables") - , A = NULL, formatA = NULL, B = NULL, formatB = NULL, - CSVcorrespondenceInventory =system.file("extdata", "CorrespondenceInventoryExample.csv", package = "correspondenceTables"), CSVcorrespondenceAnalysis = system.file("extdata", "CorrespondenceAnalysisExample.csv", package = "correspondenceTables")) - -# Print the results -print(result$Annexe_A) -print(result$Annexe_B) - -``` - -For this example we use output **CSVcorrespondenceInventory** **CSVcorrespondenceAnalysis** are not NULL. -Indeed we put our output in these csv files and we can see these with the both print on this function. - - -**Another example** - -Here we use all the function parameters. 
- -We'll see what happens with AB, which contains data from nace2 & nace2.1. - -A which contains Nace2 as a classification source - -B which contains Nace2.1 as the classification target - -formatA format B will have a numeric value of 1 and 5 here to have the first position and the last position at the lowest level. - -```{r} -result2 <-analyseCorrespondenceTable(AB = (system.file("extdata", "ab_data.csv", package = "correspondenceTables")), - A = (system.file("extdata", "a_data.csv", package = "correspondenceTables")) , - formatA = c(1,5) , - B = B <- (system.file("extdata", "b_data.csv", package = "correspondenceTables")), - formatB = c(1,5), - CSVcorrespondenceInventory = T, CSVcorrespondenceAnalysis = T) - -print(result2$Annexe_A) -###for Annexe_A run this code : View(result2$Annexe_A) -print(result2$Annexe_B) -``` - - +--- +title: "AnalyseCorrespondenceTable" +description: "This vignette explains how to use the analyseCorrespondenceTable function." +vignette: > + %\VignetteIndexEntry{Analyse classification correspondence tables with the correspondenceTables package} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Description + +This vignette explains how to use the `analyseCorrespondenceTable` function to perform analysis on correspondence tables. + +### Package Installation + +To use the `analyseCorrespondenceTable` function, you first need to install and load the `correspondenceTables` package. 
You can use the following command:
+
+```{r include=FALSE}
+knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
+```
+
+```{r}
+library(correspondenceTables)
+```
+
+```{r results='hide'}
+# Perform analysis
+result <- analyseCorrespondenceTable(AB =system.file("extdata", "ExempleAnnexe.csv", package = "correspondenceTables")
+ , A = NULL, longestAcodeOnly = FALSE, B = NULL, longestBcodeOnly = FALSE,
+ CSVcorrespondenceInventory = NULL, CSVcorrespondenceAnalysis = NULL)
+
+
+
+# Print the results
+print(result$Inventory)
+print(result$Analysis)
+
+```
+
+In this example the arguments **CSVcorrespondenceInventory** and **CSVcorrespondenceAnalysis** are left as NULL, so no CSV files are written.
+Instead, the inventory and analysis results are inspected directly with the two print() calls above.
+
+
+**Another example**
+
+Here we use all the function parameters.
+
+We'll see what happens with AB, which contains data from nace2 & nace2.1.
+
+A which contains Nace2 as a classification source
+
+B which contains Nace2.1 as the classification target
+
+longestAcodeOnly and longestBcodeOnly can be set to TRUE to retain only the lowest-level codes; in this example they are left as FALSE. 
+ +```{r} +result2 <-analyseCorrespondenceTable(AB = system.file("extdata", "ExempleAnnexe.csv", package = "correspondenceTables"), A = NULL , longestAcodeOnly = FALSE , + B = NULL, + longestBcodeOnly = FALSE, CSVcorrespondenceInventory = NULL, CSVcorrespondenceAnalysis = NULL) + +print(result2$Inventory) +###for Inventory run this code : View(result2$Inventory) +print(result2$Analysis) +``` + + diff --git a/vignettes/Retrieve_classification_and_correspondence_tables.Rmd b/vignettes/Retrieve_classification_and_correspondence_tables.Rmd index 640b9b2..d1c01a1 100644 --- a/vignettes/Retrieve_classification_and_correspondence_tables.Rmd +++ b/vignettes/Retrieve_classification_and_correspondence_tables.Rmd @@ -1,216 +1,238 @@ ---- -title: "Retrieve classification and correspondence tables with the correspondenceTables package" -output: rmarkdown::html_vignette - -vignette: > - %\VignetteIndexEntry{Retrieve classification and correspondence tables with the correspondenceTables package} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r include=FALSE} -knitr::opts_chunk$set(collapse = TRUE, comment = "#>") -``` - -This vignette provides information about the functions included in the correspondenceTables package used to extract classification and correspondence tables from CELLAR and FAO endpoints. - -```{r} -library(correspondenceTables) -``` - -The main two functions used to extract tables are retrieveClassificationTable() and retrieveCorrespondenceTable(), for classification and correspondence tables respectively. Other functions (prefixList(), classificationEndpoint(), dataStructure() and correspondenceList()) are used to understand the structure of each classification or correspondence table and retrieve information needed as input of the two main functions. 
- -## Extract Classification Tables - -The function retrieveClassificationTable() retrieves the classification table from the CELLAR and FAO repository, which can be used as an input to the newCorrespondenceTable and updateCorrespondenceTable functions. This function has five arguments: - -* prefix - the SPARQL instruction for a declaration of a namespace prefix -* endpoint - the SPARQL Endpoint: CELLAR or FAO -* conceptScheme - a unique identifier associated to specific classification table -* level - levels of the objects in the collection to be retrieved -* language - language of the table -* CSVout - preferred choice of output -* showQuery - the SPARQL query generated - -The values of the prefix, conceptScheme and level can be found using the classificationEndpoint() and dataStructure() functions (see below). -The arguments level, language, CSVout and showQuery are optional and set as default as level="ALL", language="en", CSVout="FALSE" and showQuery="FALSE". - -The namespace prefix needed to execute the SPARQL query are automatically generated using the prefixList() function (see below). 
- -**Application of function retrieveClassificationTable()** - -The following code is used to retrieve the NACE2 classification from CELLAR: - -```{r, results = "hide"} -endpoint = "CELLAR" -# obtain prefix and conceptscheme using classificationEndpoint() -classificationEndpoint(endpoint)[[1]][,1:4] -prefix = "nace21" -conceptScheme = "nace2.1" -## You can modify the language by example "en" "bg" -language = "en" -CSVout = F -showQuery = TRUE -level = "ALL" -# identify lower level using dataStructure() -# level_dt = dataStructure(prefix, conceptScheme, endpoint, language) -# level = level_dt[nrow(level_dt),2] -# level_dt; level - -result_ls = retrieveClassificationTable(prefix, endpoint, conceptScheme, level , language, CSVout, showQuery) - -# identify the SPARQL query used -SPARQLquery = result_ls[[1]] -### cat put the break line for the sparql query -cat(SPARQLquery, sep = "\n") -# obtained the table -table = result_ls[[2]] - -``` - - - -### Other functions used to extract Classification Tables - -Following, the other functions used to extract the required information for the retrieveClassificationTable() function. - -**Application of function prefixList()** - -This function automatically obtains a list the prefixes from CELLAR and FAO used in the retrieveClassificationTable() and retrieveCorrespondenceTable() functions to retrieve the required tables. The list will be updated automatically each time a new classification or correspondence classification is introduced in CELLAR or FAO repository. 
- -```{r, results = "hide"} -endpoint = "CELLAR" -prefixList(endpoint) -``` - -**Application of function classificationEndpoint()** - -The function returns a table with information needed to retrieve the classification table: -\itemize{ - \item Prefix name: the SPARQL instruction for a declaration of a namespace prefix - \item Conceptscheme: a unique identifier associated to specific classification table - \item URI: the URL from which the SPARQL query was retrieved - \item Name: the name of the table retrieved -} - -```{r, results = "hide"} - endpoint = "ALL" - list_data = classificationEndpoint(endpoint) -``` - - -**Application of function dataStructure()** - -Each classification has a different level based on their structure. The function dataStructure() retrieves information about the level names, their hierarchy and the numbers of records for all the classification available in the repositories (CELLAR and FAO). The level information needed as an optional arguments of the retrieveClassificationTable() function. - -The function returns a table with the following columns: -\itemize{ - \item Concept_Scheme: a unique identifier associated to specific classification table - \item Level: the levels of the objects in the collection - \item Depth: identify the hierarchy of each level - \item Count: the number of objects retrieved in each level -} - -The following code produce a list including the structure of each classification available in CELLAR and FAO. 
-```{r,results = "hide"} -## Obtain the structure of all classifications -## CELLAR -# data_CELLAR = list() -# endpoint = "CELLAR" -# #Get info to retrieve structure using classificationEndpoint() -# list_data = classificationEndpoint("ALL") - -# #loop over all classification in CELLAR -# for (i in 1:nrow(list_data$CELLAR)){ -# prefix = list_data$CELLAR[i,1] -# conceptScheme = list_data$CELLAR[i,2] -# #language by default is English -# data_CELLAR[[i]] = dataStructure(prefix, conceptScheme, endpoint) -# } -# names(data_CELLAR) = list_data$CELLAR[,1] - -## FAO -# data_FAO = list() -# endpoint = "FAO" -# for (i in 1:nrow(list_data$FAO)){ -# prefix = list_data$FAO[i,1] -# conceptScheme = list_data$FAO[i,2] -# data_FAO[[i]] = dataStructure(prefix, conceptScheme, endpoint) -# } -# names(data_FAO) = list_data$FAO[,1] -``` - -## Extract Correspondence Tables -The function retrieveCorrespondenceTable() retrieves the correspondences table from the CELLAR and FAO repository, which can be used as an input to the newCorrespondenceTable and updateCorrespondenceTable functions. This function has four arguments: - -* prefix - the SPARQL instruction for a declaration of a namespace prefix -* endpoint - the SPARQL Endpoint: CELLAR or FAO -* ID_table - the ID of the correspondence table -* language - language of the table -* CSVout - preferred choice of output -* showQuery - the SPARQL query generated - -The values of the prefix and ID_table can be found using the correspondenceList() function (see below). -The argument language, CSVout and showQuery are optional and set as default as language="en", CSVout="FALSE" and showQuery="TRUE". - -The namespace prefix needed to execute the SPARQL query are automatically generated using the prefixList() function (see above). 
- -**Application of function retrieveCorrespondenceTable()** - -To following code is used to retrieve the correspondence table between NACE2 and CPA21 from CELLAR: -```{r, results = "hide"} - -endpoint = "CELLAR" -### correspondenceList -correspondenceList(endpoint) -prefix = "nace2" -ID_table = "NACE2_CPA21" -language = "en" -CSVout = FALSE -showQuery= TRUE - -result_ls = retrieveCorrespondenceTable(prefix, endpoint, ID_table, language, CSVout, showQuery) - -# identify the SPARQL query used -SPARQLquery = result_ls[[1]] -cat(SPARQLquery, sep = "\n") -# obtained the table -table = result_ls[[2]] -``` - -To following code is used to retrieve a specific correspondence table between CPCv21 and ISIC4 from FAO: -```{r, results = "hide"} -# endpoint = "FAO" -# prefix = "CPCv21" -# ID_table = "CPC21-ISIC4" -# language = "en" -# CSVout = FALSE -# showQuery= TRUE -# -# result_ls = retrieveCorrespondenceTable(prefix, endpoint, ID_table, language, CSVout, showQuery) -# -# # identify the SPARQL query used -# SPARQLquery = result_ls[[1]] -# -# # obtained the table -# table = result_ls[[2]] -``` - -### Other functions used to extract Correspondence Tables -Following, the function used to extract the required information for the retrieveCorrespondenceTable() function. - - -**Application of function correspondenceList()** - -The correspondenceList() provides an overview of all the available correspondence classification from CELLAR and FAO repository. 
- -```{r, message=FALSE, warning=FALSE, results = "hide"} -corr_list = correspondenceList("ALL") - -#provides a table with all the correspondence tables for NACE2 in CELLAR -corr_list$CELLAR$`nace2:` - -#provides a table with all the correspondence tables for CPC v21 in FAO -# corr_list$FAO$`CPCv21:` -``` - +--- +title: "Retrieve classification and correspondence tables with the correspondenceTables package" +output: rmarkdown::html_vignette + +vignette: > + %\VignetteIndexEntry{Retrieve classification and correspondence tables with the correspondenceTables package} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r include=FALSE} +knitr::opts_chunk$set(collapse = TRUE, comment = "#>") +``` + +This vignette provides information about the functions included in the correspondenceTables package used to extract classification and correspondence tables from CELLAR and FAO endpoints. + +```{r} +library(correspondenceTables) +``` + +The main two functions used to extract tables are retrieveClassificationTable() and retrieveCorrespondenceTable(), for classification and correspondence tables respectively. Other functions (prefixList(), classificationEndpoint(), dataStructure() and correspondenceList()) are used to understand the structure of each classification or correspondence table and retrieve information needed as input of the two main functions. + +## Extract Classification Tables + +The function retrieveClassificationTable() retrieves the classification table from the CELLAR and FAO repository, which can be used as an input to the newCorrespondenceTable and updateCorrespondenceTable functions. 
This function has seven arguments:
+
+* prefix - the SPARQL instruction for a declaration of a namespace prefix
+* endpoint - the SPARQL Endpoint: CELLAR or FAO
+* conceptScheme - a unique identifier associated to specific classification table
+* level - levels of the objects in the collection to be retrieved
+* language - language of the table
+* CSVout - preferred choice of output
+* showQuery - the SPARQL query generated
+
+The values of the prefix, conceptScheme and level can be found using the classificationEndpoint() and dataStructure() functions (see below).
+The arguments level, language, CSVout and showQuery are optional and set as default as level="ALL", language="en", CSVout="NULL" and showQuery="FALSE".
+
+The namespace prefixes needed to execute the SPARQL query are automatically generated using the prefixList() function (see below).
+
+**Application of function retrieveClassificationTable()**
+
+The following code is used to retrieve the German, French and Bulgarian NACE2 classification from static data:
+
+```{r, results = "hide"}
+endpoint = "CELLAR"
+# obtain prefix and conceptscheme using classificationEndpoint()
+classificationEndpoint(endpoint)[[1]][,1:4]
+prefix = "nace2"
+conceptScheme = "nace2"
+## You can modify the language by example "en" "bg"
+language = "en"
+CSVout = NULL
+showQuery = TRUE
+level = "ALL"
+# identify lower level using dataStructure()
+# level_dt = dataStructure(prefix, conceptScheme, endpoint, language)
+# level = level_dt[nrow(level_dt),2]
+# level_dt; level
+options(useLocalDataForVignettes = TRUE)
+# result_ls = retrieveClassificationTable(prefix, endpoint, conceptScheme, level , language, CSVout, showQuery)
+result_de <-retrieveClassificationTable(prefix = "nace2", endpoint = "CELLAR", conceptScheme = "nace2", language = "de",CSVout=NULL)
+
+result_fr <-retrieveClassificationTable(prefix = "nace2", endpoint = "CELLAR", conceptScheme = "nace2", language = "fr",CSVout=NULL)
+
+result_bg <-retrieveClassificationTable(prefix 
= "nace2", endpoint = "CELLAR", conceptScheme = "nace2", language = "bg",CSVout=NULL) + +result_de +result_fr +result_bg +# # identify the SPARQL query used +# SPARQLquery = result_ls[[1]] +# ### cat put the break line for the sparql query +# cat(SPARQLquery, sep = "\n") +# # obtained the table +# table = result_ls[[2]] +``` + + + +### Other functions used to extract Classification Tables + +Following, the other functions used to extract the required information for the retrieveClassificationTable() function. + +**Application of function prefixList()** + +This function automatically obtains a list the prefixes from CELLAR and FAO used in the retrieveClassificationTable() and retrieveCorrespondenceTable() functions to retrieve the required tables. The list will be updated automatically each time a new classification or correspondence classification is introduced in CELLAR or FAO repository. + +```{r, results = "hide"} +endpoint = "CELLAR" +prefixList(endpoint) +``` + +**Application of function classificationEndpoint()** + +The function returns a table with information needed to retrieve the classification table: +\itemize{ + \item Prefix name: the SPARQL instruction for a declaration of a namespace prefix + \item Conceptscheme: a unique identifier associated to specific classification table + \item URI: the URL from which the SPARQL query was retrieved + \item Name: the name of the table retrieved +} + +```{r, results = "hide"} + endpoint = "ALL" + list_data = classificationEndpoint(endpoint) +``` + + +**Application of function dataStructure()** + +Each classification has a different level based on their structure. The function dataStructure() retrieves information about the level names, their hierarchy and the numbers of records for all the classification available in the repositories (CELLAR and FAO). The level information needed as an optional arguments of the retrieveClassificationTable() function. 
+ +The function returns a table with the following columns: +\itemize{ + \item Concept_Scheme: a unique identifier associated to specific classification table + \item Level: the levels of the objects in the collection + \item Depth: identify the hierarchy of each level + \item Count: the number of objects retrieved in each level +} + +The following code produce a list including the structure of each classification available in CELLAR and FAO. +```{r,results = "hide"} +## Obtain the structure of all classifications +## CELLAR +# data_CELLAR = list() +# endpoint = "CELLAR" +# #Get info to retrieve structure using classificationEndpoint() +# list_data = classificationEndpoint("ALL") + +# #loop over all classification in CELLAR +# for (i in 1:nrow(list_data$CELLAR)){ +# prefix = list_data$CELLAR[i,1] +# conceptScheme = list_data$CELLAR[i,2] +# #language by default is English +# data_CELLAR[[i]] = dataStructure(prefix, conceptScheme, endpoint) +# } +# names(data_CELLAR) = list_data$CELLAR[,1] + +## FAO +# data_FAO = list() +# endpoint = "FAO" +# for (i in 1:nrow(list_data$FAO)){ +# prefix = list_data$FAO[i,1] +# conceptScheme = list_data$FAO[i,2] +# data_FAO[[i]] = dataStructure(prefix, conceptScheme, endpoint) +# } +# names(data_FAO) = list_data$FAO[,1] +``` + +## Extract Correspondence Tables +The function retrieveCorrespondenceTable() retrieves the correspondences table from the CELLAR and FAO repository, which can be used as an input to the newCorrespondenceTable and updateCorrespondenceTable functions. This function has four arguments: + +* prefix - the SPARQL instruction for a declaration of a namespace prefix +* endpoint - the SPARQL Endpoint: CELLAR or FAO +* ID_table - the ID of the correspondence table +* language - language of the table +* CSVout - preferred choice of output +* showQuery - the SPARQL query generated + +The values of the prefix and ID_table can be found using the correspondenceList() function (see below). 
+The argument language, CSVout and showQuery are optional and set as default as language="en", CSVout="FALSE" and showQuery="TRUE". + +The namespace prefix needed to execute the SPARQL query are automatically generated using the prefixList() function (see above). + +**Application of function retrieveCorrespondenceTable()** + +To following code is used to retrieve the French German and Bulgarian correspondence table between NACE2 and CPA21 from static data : +```{r, results = "hide"} + +# endpoint = "CELLAR" +# ### correspondenceList +# correspondenceList(endpoint) +# prefix = "nace2" +# ID_table = "NACE2_CPA21" +# language = "en" +# CSVout = FALSE +# showQuery= TRUE +# +# result_ls = retrieveCorrespondenceTable(prefix, endpoint, ID_table, language, CSVout, showQuery) +# +# # identify the SPARQL query usedN +# SPARQLquery = result_ls[[1]] +# cat(SPARQLquery, sep = "\n") +# # obtained the table +# table = result_ls[[2]] +options(useLocalDataForVignettes = TRUE) +result_cor_bg<- retrieveCorrespondenceTable(prefix = "nace2", endpoint = "CELLAR", ID_table = "NACE2_CPA21",language = "bg", + CSVout = FALSE) + +result_cor_fr<- retrieveCorrespondenceTable(prefix = "nace2", endpoint = "CELLAR", ID_table = "NACE2_CPA21",language = "fr", + CSVout = FALSE) + +result_cor_de<- retrieveCorrespondenceTable(prefix = "nace2", endpoint = "CELLAR", ID_table = "NACE2_CPA21",language = "de", + CSVout = FALSE) + + + +result_cor_bg +result_cor_fr +result_cor_de +``` + +To following code is used to retrieve a specific correspondence table between CPCv21 and ISIC4 from FAO: +```{r, results = "hide"} +# endpoint = "FAO" +# prefix = "CPCv21" +# ID_table = "CPC21-ISIC4" +# language = "en" +# CSVout = NULL +# showQuery= TRUE +# +# result_ls = retrieveCorrespondenceTable(prefix, endpoint, ID_table, language, CSVout, showQuery) +# +# # identify the SPARQL query used +# SPARQLquery = result_ls[[1]] +# +# # obtained the table +# table = result_ls[[2]] +``` + +### Other functions used to extract 
Correspondence Tables +Following, the function used to extract the required information for the retrieveCorrespondenceTable() function. + + +**Application of function correspondenceList()** + +The correspondenceList() provides an overview of all the available correspondence classification from CELLAR and FAO repository. + +```{r, message=FALSE, warning=FALSE, results = "hide"} +corr_list = correspondenceList("ALL") + +#provides a table with all the correspondence tables for NACE2 in CELLAR +corr_list$CELLAR$`nace2:` + +#provides a table with all the correspondence tables for CPC v21 in FAO +# corr_list$FAO$`CPCv21:` +``` + diff --git a/vignettes/VignetteAggregateClassificationQC.Rmd b/vignettes/VignetteAggregateClassificationQC.Rmd index 89889ba..87a2f04 100644 --- a/vignettes/VignetteAggregateClassificationQC.Rmd +++ b/vignettes/VignetteAggregateClassificationQC.Rmd @@ -1,62 +1,62 @@ ---- -title: "Vignette_Aggregate_CorrespondenceTable" -output: html_document - - -vignette: > - %\VignetteIndexEntry{Aggregate classification between A & B with the correspondenceTables package} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - - - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -This vignette provides information about the functions included in the correspondenceTables package used to create an aggregation between two classifications - -Load the package correspondenceTables : -```{r cars} -library(correspondenceTables) -``` - -The `aggregateCorrespondenceTable` function in R is designed to aggregate correspondence tables between two hierarchical classifications. It facilitates the process of expressing statistics compiled at different levels in classification A to corresponding levels in classification B. The function output is mechanically defined and provides candidate aggregations for subsequent analysis by statistical classification experts. - -The function expects three mandatory CSV files (AB, A, and B) as input. 
The structure of these files is crucial. - -- **AB**: Input correspondence table -- **A**: Source classification table -- **B**: Target classification table -- **CSVOUT** Retrieve the result as a csv - - -**Application of the aggregateCorrespondenceTable()** - -```{r, results = "hide"} - AB <- (system.file("extdata", "ab_data.csv", package = "correspondenceTables")) - A <- (system.file("extdata", "a_data.csv", package = "correspondenceTables")) - B <- (system.file("extdata", "b_data.csv", package = "correspondenceTables")) - - - result <- aggregateCorrespondenceTable(AB = AB, A = A, B = B, CSVout = FALSE) - - print(result) - -``` - -**Other Example** - -```{r, results = "hide"} - - AB <- (system.file("extdata", "ab_data.csv", package = "correspondenceTables")) - A <- (system.file("extdata", "a_data.csv", package = "correspondenceTables")) - B <- (system.file("extdata", "b_data.csv", package = "correspondenceTables")) - - - result <- aggregateCorrespondenceTable(AB = AB, A = A, B = B, CSVout = TRUE) - - print(result) -``` - +--- +title: "Vignette_Aggregate_CorrespondenceTable" +output: html_document + + +vignette: > + %\VignetteIndexEntry{Aggregate classification between A & B with the correspondenceTables package} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +This vignette provides information about the functions included in the correspondenceTables package used to create an aggregation between two classifications + +Load the package correspondenceTables : +```{r cars} +library(correspondenceTables) +``` + +The `aggregateCorrespondenceTable` function in R is designed to aggregate correspondence tables between two hierarchical classifications. It facilitates the process of expressing statistics compiled at different levels in classification A to corresponding levels in classification B. 
The function output is mechanically defined and provides candidate aggregations for subsequent analysis by statistical classification experts. + +The function expects three mandatory CSV files (AB, A, and B) as input. The structure of these files is crucial. + +- **AB**: Input correspondence table +- **A**: Source classification table +- **B**: Target classification table +- **CSVOUT** Retrieve the result as a csv + + +**Application of the aggregateCorrespondenceTable()** + +```{r, results = "hide"} + AB <- (system.file("extdata", "ab_data.csv", package = "correspondenceTables")) + A <- (system.file("extdata", "a_data.csv", package = "correspondenceTables")) + B <- (system.file("extdata", "b_data.csv", package = "correspondenceTables")) + + + result <- aggregateCorrespondenceTable(AB = AB, A = A, B = B, CSVout = NULL) + + print(result) + +``` + +**Other Example** + +```{r, results = "hide"} + + AB <- (system.file("extdata", "ab_data.csv", package = "correspondenceTables")) + A <- (system.file("extdata", "a_data.csv", package = "correspondenceTables")) + B <- (system.file("extdata", "b_data.csv", package = "correspondenceTables")) + + + result <- aggregateCorrespondenceTable(AB = AB, A = A, B = B, CSVout = NULL) + + print(result) +``` + diff --git a/vignettes/VignetteCorrectionAndLenghtsFile.Rmd b/vignettes/VignetteCorrectionAndLenghtsFile.Rmd index 37cf51b..8550d8b 100644 --- a/vignettes/VignetteCorrectionAndLenghtsFile.Rmd +++ b/vignettes/VignetteCorrectionAndLenghtsFile.Rmd @@ -1,144 +1,146 @@ ---- -title: "CorrectionClassification" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{CorrectionClassification & LengthsFile} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -Introduction -This vignette provides information about the functions correctionClassification and lengthsFile included in the correspondenceTables 
package. These functions are designed to assist in retrieving classification tables from CELLAR and FAO repositories. -```{r} -library(correspondenceTables) -``` - - -**CorrectionClassification Function** - -The correctionClassification function is used to correct the classification codes based on specific rules for each classification. It takes the following parameter: - -Classification - Code name (e.g. nace2): the code of each object -Classification Label - corresponding name of each object - - -**Application of function CorrectionClassification ** - -To following code is used to corrects the classification table by adding necessary prefixes or removing unwanted characters. -```{r} -prefix = "nace2" -conceptScheme = "nace2" -endpoint = "CELLAR" -classification = retrieveClassificationTable(prefix, endpoint, conceptScheme,level="ALL")$ClassificationTable -classification = classification[,c(1,2)] -colnames(classification)[1:2] = c("Code", "Label") -``` - -These different code show the different correction for each classification. - -**Correction for (NACE - NACE 2.1 - CPA21 - and ISIC) add a letter ** -Letter addition for NACE, NACE 2.1, CPA21, and ISIC: - -For each classification (NACE, NACE 2.1, CPA21, and ISIC), specific code ranges are identified using the substr function. Then, a letter is added to the corresponding code values in the classification$Code column. 
For example, codes starting with "01" or "02" or "03" are assigned the letter "A", codes starting with "05" or "06" or "07" or "08" or "09" are assigned the letter "B" and for the other number we input different letters from alphabet -```{r} - if (prefix %in% c("nace2", "nace21", "cpa21", "ISICrev4")) { - A_code = which(substr(classification$Code, 1, 2) %in% c("01", "02", "03")) - classification$Code[A_code] = paste0("A", classification$Code[A_code]) - B_code = which(substr(classification$Code, 1, 2) %in% c("05", "06", "07", "08", "09")) - classification$Code[B_code] = paste0("B", classification$Code[B_code]) - C_code = which(substr(classification$Code, 1, 2) %in% c("10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", - "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33")) - classification$Code[C_code] = paste0("C", classification$Code[C_code]) - D_code = which(substr(classification$Code, 1, 2) %in% c("35")) - classification$Code[D_code] = paste0("D", classification$Code[D_code]) - E_code = which(substr(classification$Code, 1, 2) %in% c("36", "37", "38", "39")) - classification$Code[E_code] = paste0("E", classification$Code[E_code]) - F_code = which(substr(classification$Code, 1, 2) %in% c("41", "42", "43")) - classification$Code[F_code] = paste0("F", classification$Code[F_code]) - G_code = which(substr(classification$Code, 1, 2) %in% c("45", "46", "47")) - classification$Code[G_code] = paste0("G", classification$Code[G_code]) - H_code = which(substr(classification$Code, 1, 2) %in% c("49", "50", "51", "52", "53")) - classification$Code[H_code] = paste0("H", classification$Code[H_code]) - I_code = which(substr(classification$Code, 1, 2) %in% c("55", "56")) - classification$Code[I_code] = paste0("I", classification$Code[I_code]) - J_code = which(substr(classification$Code, 1, 2) %in% c("58", "59", "60", "61", "62", "63")) - classification$Code[J_code] = paste0("J", classification$Code[J_code]) - K_code = 
which(substr(classification$Code, 1, 2) %in% c("64", "65", "66")) - classification$Code[K_code] = paste0("K", classification$Code[K_code]) - L_code = which(substr(classification$Code, 1, 2) %in% c("68")) - classification$Code[L_code] = paste0("L", classification$Code[L_code]) - M_code = which(substr(classification$Code, 1, 2) %in% c("69", "70", "71", "72", "73", "74", "75")) - classification$Code[M_code] = paste0("M", classification$Code[M_code]) - N_code = which(substr(classification$Code, 1, 2) %in% c("77", "78", "79", "80", "81", "82")) - classification$Code[N_code] = paste0("N", classification$Code[N_code]) - O_code = which(substr(classification$Code, 1, 2) %in% c("84")) - classification$Code[O_code] = paste0("O", classification$Code[O_code]) - P_code = which(substr(classification$Code, 1, 2) %in% c("85")) - classification$Code[P_code] = paste0("P", classification$Code[P_code]) - Q_code = which(substr(classification$Code, 1, 2) %in% c("86", "87", "88")) - classification$Code[Q_code] = paste0("Q", classification$Code[Q_code]) - R_code = which(substr(classification$Code, 1, 2) %in% c("90", "91", "92", "93")) - classification$Code[R_code] = paste0("R", classification$Code[R_code]) - S_code = which(substr(classification$Code, 1, 2) %in% c("94", "95", "96")) - classification$Code[S_code] = paste0("S", classification$Code[S_code]) - T_code = which(substr(classification$Code, 1, 2) %in% c("97", "98")) - classification$Code[T_code] = paste0("T", classification$Code[T_code]) - U_code = which(substr(classification$Code, 1, 2) %in% c("99")) - classification$Code[U_code] = paste0("U", classification$Code[U_code]) - } -``` - - -**Remove .0 for 10, 11 and 12 division for ecoicop** - - -If the prefix is "ecoicop" and the code values in the classification$Code column are "10.0", "11.0", or "12.0", they are replaced with "10", "11", and "12", respectively. 
-```{r} - if (prefix %in% c("ecoicop")) { - level1_code = which(classification$Code %in% c("10.0", "11.0", "12.0")) - classification$Code[level1_code] = c("10", "11", "12") - } - -``` - -**remove weird code 00.99.t and 00.99.t for prodcom2019** - -Remove weird codes for prodcom2019: - -If the prefix is "prodcom2019" and the code values in the classification$Code column are "00.99.t" or "00.99.z", those rows are removed from the classification data frame. -```{r} - - if (prefix %in% c("prodcom2019")) { - level1_code = which(classification$Code %in% c("00.99.t", "00.99.z")) - classification = classification[-level1_code,] - } - -``` - -**remove section for CN** - -If the prefix is one of "cn2017", "cn2018", "cn2019", "cn2020", "cn2021", "cn2021", "cn2022", or "cn2023", sections are identified based on the presence of alphabetic characters in the classification Code column n. Rows containing sections are then removed from the classification data frame.s -```{r} - if (prefix %in% c("cn2017", "cn2018", "cn2019", "cn2020", "cn2021", "cn2021", "cn2022", "cn2023")) { - level1_code = which(gsub("[^a-zA-Z]", "", classification$Code)!= "") - if (length(level1_code)> 0){ - classification = classification[-level1_code,] - } - } -``` - -**remove "." in the end of the code for CBF** - -If the prefix is "cbf10", the last character (".") is removed from the code values in the classification Code column. 
-```{r}
-if (prefix %in% c("cbf10")) {
- classification[,1] = substr(classification[,1], 1, nchar(classification[,1])-1)
- }
-```
+---
+title: "CorrectionClassification"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{CorrectionClassification & LengthsFile}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>"
+)
+```
+
+Introduction
+This vignette provides information about the functions correctionClassification and lengthsFile included in the correspondenceTables package. These functions are designed to assist in retrieving classification tables from CELLAR and FAO repositories.
+```{r}
+library(correspondenceTables)
+```
+
+
+**CorrectionClassification Function**
+
+The correctionClassification function is used to correct the classification codes based on specific rules for each classification. It takes the following parameters:
+
+Classification - Code name (e.g. nace2): the code of each object
+Classification Label - corresponding name of each object
+
+
+**Application of function CorrectionClassification **
+
+The following code is used to correct the classification table by adding necessary prefixes or removing unwanted characters.
+```{r}
+prefix = "nace2"
+conceptScheme = "nace2"
+endpoint = "CELLAR"
+language = "en"
+options(useLocalDataForVignettes = TRUE)
+classification = retrieveClassificationTable(prefix,endpoint,conceptScheme,language="en",level="ALL")
+colnames(classification)[1:2] = c("Code", "Label")
+classification
+```
+
+The following code chunks show the specific correction applied for each classification.
+
+**Correction for (NACE - NACE 2.1 - CPA21 - and ISIC) add a letter **
+Letter addition for NACE, NACE 2.1, CPA21, and ISIC:
+
+For each classification (NACE, NACE 2.1, CPA21, and ISIC), specific code ranges are identified using the substr function. 
Then, a letter is added to the corresponding code values in the classification$Code column. For example, codes starting with "01" or "02" or "03" are assigned the letter "A", codes starting with "05" or "06" or "07" or "08" or "09" are assigned the letter "B", and the remaining divisions are assigned the subsequent letters of the alphabet.
+```{r}
+ if (prefix %in% c("nace2", "nace21", "cpa21", "ISICrev4")) {
+ A_code = which(substr(classification$Code, 1, 2) %in% c("01", "02", "03"))
+ classification$Code[A_code] = paste0("A", classification$Code[A_code])
+ B_code = which(substr(classification$Code, 1, 2) %in% c("05", "06", "07", "08", "09"))
+ classification$Code[B_code] = paste0("B", classification$Code[B_code])
+ C_code = which(substr(classification$Code, 1, 2) %in% c("10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+ "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33"))
+ classification$Code[C_code] = paste0("C", classification$Code[C_code])
+ D_code = which(substr(classification$Code, 1, 2) %in% c("35"))
+ classification$Code[D_code] = paste0("D", classification$Code[D_code])
+ E_code = which(substr(classification$Code, 1, 2) %in% c("36", "37", "38", "39"))
+ classification$Code[E_code] = paste0("E", classification$Code[E_code])
+ F_code = which(substr(classification$Code, 1, 2) %in% c("41", "42", "43"))
+ classification$Code[F_code] = paste0("F", classification$Code[F_code])
+ G_code = which(substr(classification$Code, 1, 2) %in% c("45", "46", "47"))
+ classification$Code[G_code] = paste0("G", classification$Code[G_code])
+ H_code = which(substr(classification$Code, 1, 2) %in% c("49", "50", "51", "52", "53"))
+ classification$Code[H_code] = paste0("H", classification$Code[H_code])
+ I_code = which(substr(classification$Code, 1, 2) %in% c("55", "56"))
+ classification$Code[I_code] = paste0("I", classification$Code[I_code])
+ J_code = which(substr(classification$Code, 1, 2) %in% c("58", "59", "60", "61", "62", "63"))
+ 
classification$Code[J_code] = paste0("J", classification$Code[J_code]) + K_code = which(substr(classification$Code, 1, 2) %in% c("64", "65", "66")) + classification$Code[K_code] = paste0("K", classification$Code[K_code]) + L_code = which(substr(classification$Code, 1, 2) %in% c("68")) + classification$Code[L_code] = paste0("L", classification$Code[L_code]) + M_code = which(substr(classification$Code, 1, 2) %in% c("69", "70", "71", "72", "73", "74", "75")) + classification$Code[M_code] = paste0("M", classification$Code[M_code]) + N_code = which(substr(classification$Code, 1, 2) %in% c("77", "78", "79", "80", "81", "82")) + classification$Code[N_code] = paste0("N", classification$Code[N_code]) + O_code = which(substr(classification$Code, 1, 2) %in% c("84")) + classification$Code[O_code] = paste0("O", classification$Code[O_code]) + P_code = which(substr(classification$Code, 1, 2) %in% c("85")) + classification$Code[P_code] = paste0("P", classification$Code[P_code]) + Q_code = which(substr(classification$Code, 1, 2) %in% c("86", "87", "88")) + classification$Code[Q_code] = paste0("Q", classification$Code[Q_code]) + R_code = which(substr(classification$Code, 1, 2) %in% c("90", "91", "92", "93")) + classification$Code[R_code] = paste0("R", classification$Code[R_code]) + S_code = which(substr(classification$Code, 1, 2) %in% c("94", "95", "96")) + classification$Code[S_code] = paste0("S", classification$Code[S_code]) + T_code = which(substr(classification$Code, 1, 2) %in% c("97", "98")) + classification$Code[T_code] = paste0("T", classification$Code[T_code]) + U_code = which(substr(classification$Code, 1, 2) %in% c("99")) + classification$Code[U_code] = paste0("U", classification$Code[U_code]) + } +``` + + +**Remove .0 for 10, 11 and 12 division for ecoicop** + + +If the prefix is "ecoicop" and the code values in the classification$Code column are "10.0", "11.0", or "12.0", they are replaced with "10", "11", and "12", respectively. 
+```{r}
+ if (prefix %in% c("ecoicop")) {
+ level1_code = which(classification$Code %in% c("10.0", "11.0", "12.0"))
+ classification$Code[level1_code] = c("10", "11", "12")
+ }
+
+```
+
+**remove weird codes 00.99.t and 00.99.z for prodcom2019**
+
+Remove weird codes for prodcom2019:
+
+If the prefix is "prodcom2019" and the code values in the classification$Code column are "00.99.t" or "00.99.z", those rows are removed from the classification data frame.
+```{r}
+
+ if (prefix %in% c("prodcom2019")) {
+ level1_code = which(classification$Code %in% c("00.99.t", "00.99.z"))
+ classification = classification[-level1_code,]
+ }
+
+```
+
+**remove section for CN**
+
+If the prefix is one of "cn2017", "cn2018", "cn2019", "cn2020", "cn2021", "cn2022", or "cn2023", sections are identified based on the presence of alphabetic characters in the classification Code column. Rows containing sections are then removed from the classification data frame.
+```{r}
+ if (prefix %in% c("cn2017", "cn2018", "cn2019", "cn2020", "cn2021", "cn2021", "cn2022", "cn2023")) {
+ level1_code = which(gsub("[^a-zA-Z]", "", classification$Code)!= "")
+ if (length(level1_code)> 0){
+ classification = classification[-level1_code,]
+ }
+ }
+```
+
+**remove "." in the end of the code for CBF**
+
+If the prefix is "cbf10", the last character (".") is removed from the code values in the classification Code column.
+```{r}
+if (prefix %in% c("cbf10")) {
+ classification[,1] = substr(classification[,1], 1, nchar(classification[,1])-1)
+ }
+```