diff --git a/DESCRIPTION b/DESCRIPTION index 77da1a1..f6fdfaa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: rtauargus Type: Package Title: Using Tau-Argus from R Language: fr -Version: 1.3.4 +Version: 1.3.5 Depends: R (>= 3.5.0) Imports: purrr (>= 0.2), @@ -67,7 +67,6 @@ Description: Protects tables by calling the Tau-Argus software from R. License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 7.3.3 VignetteBuilder: knitr URL: https://inseefrlab.github.io/rtauargus, https://github.com/inseefrlab/rtauargus, @@ -75,3 +74,4 @@ URL: https://inseefrlab.github.io/rtauargus, BugReports: https://github.com/inseefrlab/rtauargus/issues Roxygen: list(markdown = TRUE) StagedInstall: no +Config/roxygen2/version: 8.0.0 diff --git a/R/analyse_metadata.R b/R/analyse_metadata.R index ec155a2..d4dd6de 100644 --- a/R/analyse_metadata.R +++ b/R/analyse_metadata.R @@ -117,7 +117,7 @@ analyse_metadata <- function(df_metadata,df_eq_indicator = NULL,verbose = FALSE) list_independent_tables <- grp_tab_in_cluster(list_split = list_split, list_translation_tables = list_translation_tables) list_cluster_treat <- tab_to_treat(list_independent_tables) - dataframe_cluster_id <- dataframe_result(list_cluster_treat) + dataframe_cluster_id <- dataframe_result(list_cluster_treat,list_hrc_identified) if(verbose){ return(list( identify_hrc = list_hrc_identified[[1]], diff --git a/R/globals.R b/R/globals.R index 24bc305..7fdf7ed 100644 --- a/R/globals.R +++ b/R/globals.R @@ -4,6 +4,7 @@ utils::globalVariables( "n_unique","column","unique_modalities","from.eg","to.eg","from","to","mutual_full", "Group","table_eg","spanning","hrc_spanning","spanning_old","tab_inclus", "starts_with","spanning_name","hrc_spanning_name","eq_indicator","rhs","total","term_number", - "eq_name","unit","var","n_total","total_alt","group", - ".") + "eq_name","unit","var","n_total","total_alt","group","initial_indicator","spanning_new", + "spanning_key","all_sides","side","sides_manquants","table_name_combined","var_mapped", + "totcode","all_combinations","covered_sides",".") ) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index ff22b0f..0bc342a 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -47,10 +47,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # check that the input is in the right format: right column names check_column_names <- function(df) { - # Expected fixed column names fixed_columns <- c("eq_name", "eq_indicator", "unit") - - # Check that the fixed columns exist if (!all(fixed_columns %in% names(df))) { stop("Error: The dataframe describing the equations between indicators is missing one or more required columns: eq_name, eq_indicator, unit.") @@ -62,47 +59,18 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # with each rhs term placed in a separate column. parsed_equations <- df_eq_indicator %>% tidyr::separate(eq_indicator, into = c("total", "rhs"), sep = "=", extra = "merge") %>% - dplyr::mutate(rhs = trimws(rhs)) %>% tidyr::separate_rows(rhs, sep = "\\+") %>% - dplyr::mutate(rhs = trimws(rhs), - total = trimws(total)) %>% + dplyr::mutate(across(c(total, rhs), trimws)) %>% dplyr::group_by(dplyr::across(-rhs)) %>% dplyr::mutate(term_number = paste0("rhs", dplyr::row_number())) %>% tidyr::pivot_wider(names_from = term_number, values_from = rhs) %>% dplyr::ungroup() %>% dplyr::select(eq_name, unit, total, everything()) - # change to long format in order to join with df_metadata_long - equations_long <- parsed_equations %>% - mutate(across(c(total, starts_with("rhs")), trimws)) %>% - tidyr::pivot_longer( - cols = c(total, starts_with("rhs")), - names_to = "side", # côté équation (total / rhs1 / rhs2...) - values_to = "var" - ) %>% - filter(!is.na(var)) - - # Identify chained equations (A = B + C, B = D + E → group both equations together) - - # Build dependency links between totals and rhs - links <- parsed_equations %>% - tidyr::pivot_longer( - cols = starts_with("rhs"), - names_to = "rhs_term", - values_to = "rhs" - ) %>% - dplyr::filter(!is.na(rhs)) %>% - dplyr::mutate( - total = trimws(as.character(total)), - rhs = trimws(as.character(rhs)) - ) %>% - dplyr::distinct() - - # ---- 1) Identify ambiguous totals ---- + # Identify ambiguous totals total_counts <- parsed_equations %>% dplyr::count(total, name = "n_total") - ambiguous_totals <- total_counts %>% dplyr::filter(n_total > 1) %>% pull(total) - # ---- 2) Build a total -> total_alt mapping by eq_name ---- + # Build a total -> total_alt mapping by eq_name # For all equations (ambiguous or not), create one row; # for non-ambiguous totals, total_alt == total alt_map <- parsed_equations %>% @@ -110,158 +78,349 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::group_by(total) %>% dplyr::arrange(eq_name) %>% # ordre stable dplyr::mutate(alt_idx = dplyr::row_number(), - total_alt = dplyr::case_when( - dplyr::n() == 1 ~ total, - alt_idx == 1 ~ total, - TRUE ~ paste0(total, "_alt", alt_idx - 1) - ) + total_alt = dplyr::case_when( + dplyr::n() == 1 ~ total, + alt_idx == 1 ~ total, + TRUE ~ paste0(total, "_alt", alt_idx - 1) + ) ) %>% dplyr::ungroup() %>% dplyr::select(eq_name, total, total_alt) - # ---- 3) Apply the mapping to the links ---- - # 'links' contains total, rhs, eq_name (if not, it must be joined beforehand) + # Identify chained equations (A = B + C, B = D + E) and group equations together + # and apply the mapping to the links # here we assume links has an eq_name column; otherwise do # left_join(links, parsed_equations %>% select(eq_name, total, rhs), ...) first - links_full <- links %>% - # replace total with its equation-specific alternative + links_full <- parsed_equations %>% + tidyr::pivot_longer( + cols = starts_with("rhs"), + names_to = "rhs_term", + values_to = "rhs" + ) %>% + dplyr::filter(!is.na(rhs)) %>% + dplyr::mutate( + total = trimws(as.character(total)), + rhs = trimws(as.character(rhs)) + ) %>% + dplyr::distinct() %>% left_join(alt_map, by = c("eq_name", "total")) %>% mutate(total = dplyr::coalesce(total_alt, total)) %>% select(-total_alt) %>% - # now replace rhs if it exists as a "total" in alt_map: - # we must choose the correct total_alt for rhs according to the equation - # where it plays the role of a total. - # to do so, join alt_map by mapping rhs -> total, keeping the alt - # corresponding to the SOURCE row eq_name. left_join(alt_map, by = c("eq_name", "rhs" = "total")) %>% mutate(rhs = dplyr::coalesce(total_alt, rhs)) %>% select(total, rhs, eq_name) %>% dplyr::distinct() - # ---- 4) Build the full graph (including all copies) ---- g_full <- graph_from_data_frame(links_full %>% select(total, rhs), directed = TRUE) - - # ---- 5) Compute components on g_full ---- comp_full <- igraph::components(g_full)$membership comp_df <- data.frame(var = names(comp_full), group = as.integer(comp_full), stringsAsFactors = FALSE) - # ---- 6) Update equations_long: - # associate the alternative variable (if present) and the corresponding group ---- - # Notes: - # - equations_long contains the original variables (var) and eq_name; - # - we want to recover the "var" or "var_alt" version used in g_full. + # reformat parsed_equations in long format in order to join with df_metadata_long + equations_long <- parsed_equations %>% + mutate(across(c(total, starts_with("rhs")), trimws)) %>% + tidyr::pivot_longer( + cols = c(total, starts_with("rhs")), + names_to = "side", + values_to = "var" + ) %>% + filter(!is.na(var)) + equations_long_full <- equations_long %>% - # join the correspondence eq_name + var (original total) -> total_alt (if any) left_join(alt_map, by = c("eq_name", "var" = "total")) %>% mutate(var_mapped = dplyr::coalesce(total_alt, var)) %>% select(-total_alt) %>% - # join the group computed on the full graph left_join(comp_df, by = c("var_mapped" = "var")) %>% - # for var_mapped without a group (isolated), keep NA or assign a single group mutate(group = as.integer(group)) # 'df_spannings' is a modified version of 'df_metadata_long' where: # - 'spanning' is replaced by its uppercase hierarchical version if available, # - 'indicator' is replaced by its uppercase hierarchical version - # (without the 'hrc_' prefix) if available. + # (without the 'hrc_' prefix) if available and 'indicator' not part of 'df_eq_indicator' + indic_not_in_eq <- setdiff(unique(df_metadata_long$indicator),unique(equations_long$var)) + + df_variable_info <- df_metadata_long %>% + mutate( + spanning_new = ifelse(is.na(hrc_spanning), spanning, toupper(hrc_spanning)) + ) %>% + distinct(var_start_name = spanning, var_end_name = spanning_new, table_name) + df_spannings <- df_metadata_long %>% - mutate(spanning_old = spanning) %>% - mutate(spanning = ifelse(is.na(hrc_spanning), - spanning, - toupper(hrc_spanning))) %>% - mutate(indicator = ifelse(is.na(hrc_indicator), - indicator, - toupper(sub("hrc_","",hrc_indicator)))) - - # 'df_variable_info' is a reference table linking original spanning names ('spanning_old') - # to their transformed counterparts ('spanning'), along with the corresponding table name. - df_variable_info <- data.frame( - var_start_name = df_spannings$spanning_old, - var_end_name = df_spannings$spanning, - table_name = df_spannings$table_name - ) %>% unique() - - # Update 'df_spannings' by removing the temporary 'spanning_old' column. - df_spannings <- df_spannings %>% select(-spanning_old) + mutate( + spanning = ifelse(is.na(hrc_spanning), spanning, toupper(hrc_spanning)), + indicator = ifelse(indicator %in% indic_not_in_eq & !is.na(hrc_indicator), + toupper(sub("hrc_", "", hrc_indicator)), indicator), + hrc_indicator = ifelse(indicator %in% unique(equations_long$var), NA, hrc_indicator) + ) df_spannings_eq <- df_spannings %>% - # delete all the non-word elements, specifically for the white spaces mutate(across(dplyr::where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% left_join(equations_long_full, by = c("indicator" = "var")) - df_eq_initial_spannings <- df_spannings_eq %>% - filter(!is.na(eq_name)) %>% + df_with_group <- df_spannings_eq %>% filter(!is.na(group)) + df_without_group <- df_spannings_eq %>% filter(is.na(group)) + + if (nrow(df_with_group) == 0) { + warning( + "Check the coherence of `df_eq_indicator` and `df_metadata`. + There is no table description in `df_metadata` with an indicator that is part of one of the equations provided in `df_eq_indicator`. + `df_eq_indicator` is useless here and will be ignored.") + if (nrow(df_without_group) > 0) { + if (all(is.na(df_without_group$hrc_indicator))) { + return(list(df_without_group, df_variable_info)) + } else { + df_no_eq_indicators <- build_spanning_based_on_hrc_indicator(df_without_group,df_spannings) + return(list(df_no_eq_indicators, df_variable_info)) + } + } + } + + spanning_combination_group <- df_with_group |> + group_by(group, table_name) |> + summarise( + spanning = list(sort(unique(spanning))), + side = first(side), + .groups = "drop" + ) |> + group_by(group) |> + mutate( + all_sides = list(sort(unique(side))), + spanning_key = purrr::map_chr(spanning, paste, collapse = "|") + ) |> + ungroup() + + spanning_combination_group <- spanning_combination_group |> + distinct(group, spanning_key, spanning, all_sides) |> + group_by(group, spanning_key) |> + summarise( + spanning = list(spanning[[1]]), + all_sides = list(all_sides[[1]]), + .groups = "drop" + ) |> + mutate( + covered_sides = purrr::map2(spanning, group, function(span_set, grp) { + spanning_combination_group |> + filter(group == grp) |> + filter(purrr::map_lgl(spanning, ~ all(span_set %in% .x))) |> + pull(side) |> + sort() |> + unique() + }), + sides_manquants = purrr::map2(all_sides, covered_sides, setdiff), + all_combinations = purrr::map_lgl(sides_manquants, ~ length(.x) == 0) + ) |> + unnest_wider(spanning, names_sep = "_") + + list_groups <- split(df_with_group, df_with_group$group) + + df_eq_initial_spannings <- purrr::map(list_groups, function(df_group) { + regroup_tables(df_group, spanning_combination_group) + }) |> + purrr::compact() |> + dplyr::bind_rows() + + table_group_mapping <- df_eq_initial_spannings %>% + mutate(table_name_combined = table_name) %>% + tidyr::separate_rows(table_name, sep = "\\.") %>% + select(table_name, table_name_combined, group) + + totcode_equation <- df_with_group %>% + filter(side == "total") %>% group_by(group) %>% - dplyr::reframe( - table_name = paste(unique(table_name), collapse = "."), - field = last(field), - hrc_field = last(hrc_field), - spanning = spanning, - hrc_spanning = hrc_spanning, - indicator = last(unit), - hrc_indicator = last(hrc_indicator) - ) %>% unique() + summarise(totcode = first(var_mapped), .groups = "drop") - # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. - # Each equation name is transformed into its uppercase form with a "^h" suffix, - # and its hierarchical version prefixed with "hrc_". - df_eq_indicator_spannings <- df_spannings_eq %>% + df_eq_indicator_spannings <- df_with_group %>% filter(!is.na(eq_name)) %>% - group_by(group) %>% + left_join(table_group_mapping, by = c("table_name", "group")) %>% + left_join(totcode_equation, by = "group") %>% + group_by(group, table_name_combined) %>% summarise( - table_name = paste(unique(table_name), collapse = "."), + table_name = first(table_name_combined), field = last(field), hrc_field = last(hrc_field), - spanning = if(length(unique(eq_name)) > 1) { + spanning = if (length(unique(eq_name)) > 1) { paste0(paste0(unique(toupper(eq_name)), collapse = "_"), "^h") } else { paste0(toupper(last(eq_name)), "^h") }, - hrc_spanning = if(length(unique(eq_name)) > 1) { - paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_")) + hrc_spanning = if (length(unique(eq_name)) > 1) { + paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_"), ".totcode.", first(totcode)) } else { - paste0("hrc_", toupper(last(eq_name))) + paste0("hrc_", toupper(last(eq_name)), ".totcode.", first(totcode)) }, indicator = last(unit), hrc_indicator = last(hrc_indicator), .groups = "drop" - ) + ) %>% + dplyr::distinct(group, table_name, spanning, hrc_spanning, .keep_all = TRUE) - # 'df_indicators' combines both initial and indicator spanning information - # into a single harmonized dataset, keeping key structural columns - # and sorting rows by table name. - df_indicators <- bind_rows(df_eq_initial_spannings,df_eq_indicator_spannings) %>% - select(table_name,field,hrc_field,indicator,hrc_indicator,everything()) %>% + df_indicators <- bind_rows(df_eq_initial_spannings, df_eq_indicator_spannings) %>% + select(table_name, field, hrc_field, indicator, hrc_indicator, spanning, hrc_spanning, group) %>% + mutate(table_name = paste(table_name, "group", group, sep = "_")) %>% + select(-group) %>% + unique() %>% arrange(table_name) - # 'df_no_eq_spannings' contains all spanning rows - # that are not associated with any equation (eq_name is missing). - df_no_eq_spannings <- df_spannings_eq %>% filter(is.na(eq_name)) + df_initial_indicator <- bind_rows(df_eq_initial_spannings, df_eq_indicator_spannings) %>% + mutate(table_name = paste(table_name, "group", group, sep = "_")) %>% + group_by(table_name) %>% + summarise( + initial_indicator = { + x <- stats::na.omit(initial_indicator) + if (length(x) == 0) NA_character_ else x[1] + }, + .groups = "drop" + ) + + df_indicators <- df_indicators %>% + left_join(df_initial_indicator, by = "table_name") - if(nrow(df_no_eq_spannings) > 0){ - if(all(is.na(df_no_eq_spannings$hrc_indicator))){ - df_indicators <- bind_rows(df_indicators,df_no_eq_spannings) %>% arrange(table_name) - return(list(df_indicators,df_variable_info)) + # Tables without group (i.e. withtout group) + if (nrow(df_without_group) > 0) { + if (all(is.na(df_without_group$hrc_indicator))) { + df_indicators <- bind_rows(df_indicators, df_without_group) %>% arrange(table_name) + return(list(df_indicators, df_variable_info)) } else { - df_no_eq_indicators <- df_no_eq_spannings %>% - filter(!is.na(hrc_indicator)) %>% - dplyr::group_by(table_name) %>% - summarise( - field = last(field), - hrc_field = last(hrc_field), - spanning = paste0(toupper(last(hrc_indicator)),"^h"), - hrc_spanning = last(hrc_indicator), - indicator = last(indicator), - hrc_indicator = last(hrc_indicator) - ) %>% - bind_rows(df_spannings, .) %>% - arrange(table_name) - df_indicators <- bind_rows(df_indicators,df_no_eq_indicators) %>% arrange(table_name) - list_hrc_identified = list(df_indicators,df_variable_info) - return(list_hrc_identified) + df_no_eq_indicators <- build_spanning_based_on_hrc_indicator(df_without_group,df_spannings) + df_indicators <- bind_rows(df_indicators, df_no_eq_indicators) %>% arrange(table_name) + return(list(df_indicators, df_variable_info)) } } else { - list_hrc_identified = list(df_indicators,df_variable_info) - return(list_hrc_identified) + return(list(df_indicators, df_variable_info)) } } + +#' Build a data frame of indicators without equations +#' +#' Internal helper that aggregates rows from \code{df_without_group} that have +#' a non-\code{NA} \code{hrc_indicator}, and appends them to \code{df_spannings}. +#' The result is used to represent response variables that are linked by a +#' hierarchy but not by any equation. +#' +#' @param df_without_group A data frame containing the rows of +#' \code{df_spannings_eq} that do not belong to any equation group +#' (\code{group} is \code{NA}). Must contain the following columns: +#' \code{table_name}, \code{field}, \code{hrc_field}, \code{indicator}, +#' and \code{hrc_indicator}. +#' @param df_spannings A data frame derived from \code{df_metadata_long} with +#' renamed spanning and indicator variables. It is used as the base to which +#' the newly built rows are appended via \code{bind_rows}. +#' +#' @return A data frame with one row per \code{table_name} for the non-\code{NA} +#' \code{hrc_indicator} rows, appended to \code{df_spannings} and sorted by +#' \code{table_name}. The returned columns are: +#' \describe{ +#' \item{table_name}{Name of the table.} +#' \item{field}{Last value of \code{field} within the group.} +#' \item{hrc_field}{Last value of \code{hrc_field} within the group.} +#' \item{spanning}{Uppercase \code{hrc_indicator} suffixed with \code{^h}.} +#' \item{hrc_spanning}{Last value of \code{hrc_indicator} within the group.} +#' \item{indicator}{Last value of \code{indicator} within the group.} +#' \item{hrc_indicator}{Last value of \code{hrc_indicator} within the group.} +#' } +#' +#' @keywords internal +build_spanning_based_on_hrc_indicator <- function(df_without_group, df_spannings) { + df_without_group %>% + filter(!is.na(hrc_indicator)) %>% + dplyr::group_by(table_name) %>% + summarise( + field = last(field), + hrc_field = last(hrc_field), + spanning = paste0(toupper(last(hrc_indicator)), "^h"), + hrc_spanning = last(hrc_indicator), + indicator = last(indicator), + hrc_indicator = last(hrc_indicator) + ) %>% + bind_rows(df_spannings, .) %>% + arrange(table_name) +} + + +#' Regroup tables within a group (i.e. equation / group of linked equations) +#' based on spanning combination completeness +#' +#' @description +#' For a given group of tables, this function identifies which tables cover all +#' sides of an equation (total, rhs1, rhs2, ...) for their spanning combination, +#' and which do not. Tables with complete combinations are merged into a single +#' row; tables with incomplete combinations are kept as standalone rows with +#' their original spannings. +#' +#' @param df_group A tibble containing the rows of a single group from +#' \code{df_with_group}. Must contain columns: \code{table_name}, +#' \code{spanning}, \code{side}, \code{var_mapped}, \code{indicator}, +#' \code{unit}, and \code{group}. +#' @param spanning_combination_group A tibble produced by the +#' \code{spanning_combination_group} pipeline, containing one row per +#' (group, spanning_key) combination. Must contain columns: \code{group}, +#' \code{spanning_key}, and \code{all_combinations} (logical indicating +#' whether the spanning combination covers all sides of the equation). +#' +#' @return A tibble with one row per (merged or solo) table cluster and +#' spanning, containing the following columns (among others): +#' \describe{ +#' \item{table_name}{Dot-separated list of merged table names (e.g. +#' \code{"T7.T9.T11"}) for complete combinations, or the original +#' table name for incomplete ones.} +#' \item{indicator}{The unit value shared across the merged tables.} +#' \item{initial_indicator}{The \code{var_mapped} value of the \code{total} +#' side, used to track the original indicator before merging.} +#' } +#' +#' @examples +#' \dontrun{ +#' list_groups <- split(df_with_group, df_with_group$group) +#' +#' df_eq_initial_spannings <- purrr::map(list_groups, function(df_group) { +#' regroup_tables(df_group, spanning_combination_group) +#' }) |> +#' purrr::compact() |> +#' dplyr::bind_rows() +#' } +#' +#' @keywords internal +regroup_tables <- function(df_group, spanning_combination_group) { + current_group <- unique(df_group$group) + + spanning_by_table <- df_group |> + group_by(table_name) |> + summarise(spanning_key = paste(sort(unique(spanning)), collapse = "|"), .groups = "drop") + + span_comb <- spanning_combination_group |> + filter(group == current_group) |> + select(spanning_key, all_combinations) + + spanning_by_table <- spanning_by_table |> left_join(span_comb, by = "spanning_key") + + tables_complete <- spanning_by_table |> filter(all_combinations) |> pull(table_name) + tables_incomplete <- spanning_by_table |> filter(!all_combinations) |> pull(table_name) + + df_merged <- if (length(tables_complete) > 0) { + df_group |> + filter(table_name %in% tables_complete) |> + left_join(spanning_by_table |> select(table_name, spanning_key), by = "table_name") |> + group_by(across(-c(table_name, side, var_mapped, indicator))) |> + summarise( + table_name = paste(sort(unique(table_name)), collapse = "."), + indicator = last(unit), + initial_indicator = var_mapped[side == "total"][1], + .groups = "drop" + ) |> + select(-spanning_key) + } + + df_solo <- if (length(tables_incomplete) > 0) { + df_group |> + filter(table_name %in% tables_incomplete) |> + mutate( + initial_indicator = var_mapped[side == "total"][1], + indicator = unit + ) |> + select(-c(side, var_mapped)) + } + + bind_rows(df_merged, df_solo) |> + arrange(table_name, spanning) |> + select(table_name, field, hrc_field, indicator, everything()) +} + diff --git a/R/tab_to_treat.R b/R/tab_to_treat.R index b8f1d28..856c03b 100644 --- a/R/tab_to_treat.R +++ b/R/tab_to_treat.R @@ -85,6 +85,12 @@ tab_to_treat <- function(list_independent_tables) { #' @param list_independent_tables A list of tibbles, typically the output of #' `grp_tab_in_cluster()` or `tab_to_treat()`. Each tibble contains metadata #' for tables grouped within a specific cluster. +#' @param list_hrc_identified A list returned by the `identify_hrc` function. The first +#' element of the list must be a data frame containing the variables: +#' - `field`: A grouping variable. +#' - `hrc_field`: The hierarchical counterpart of `field`. +#' - `indicator`: A variable used to link tables. +#' - `hrc_indicator`: The hierarchical counterpart of `indicator`. #' #' @return A single dataframe (`dfMetadata_to_treat`) with the following structure: #' - `cluster`: Identifier for the cluster each table belongs to. @@ -130,24 +136,44 @@ tab_to_treat <- function(list_independent_tables) { #' } #' #' @importFrom purrr imap_dfr -dataframe_result <- function(list_independent_tables) { - # TODO modifier car il y a une erreur (column field doesn't exist) - # Combine the list of tibbles into a single dataframe with cluster identifiers +dataframe_result <- function(list_independent_tables, list_hrc_identified) { dataframe_metadata <- purrr::imap_dfr(list_independent_tables, function(tibble, tibble_name) { tibble %>% mutate(cluster = tibble_name) - }) %>% + }) + + # If the initial_indicator column exists in list_hrc_identified, + # replace indicator with initial_indicator whenever initial_indicator is not NA + if ("initial_indicator" %in% names(list_hrc_identified[[1]])) { + hrc_indicator_map <- list_hrc_identified %>% + purrr::map_dfr(identity) %>% + filter(!is.na(initial_indicator)) %>% + select(table_name, field, indicator, initial_indicator) %>% + distinct() + + dataframe_metadata <- dataframe_metadata %>% + left_join(hrc_indicator_map, by = c("table_name", "field", "indicator")) %>% + mutate(indicator = dplyr::if_else(!is.na(initial_indicator), initial_indicator, indicator)) %>% + select(-initial_indicator) + } + + dataframe_metadata <- dataframe_metadata %>% + mutate(table_name = gsub("_group_[0-9]+", "", table_name)) %>% select( cluster, table_name, field, indicator, - # Dynamically order columns spanning_xxx by their numeric suffix all_of(names(.)[grepl("^spanning_\\d+$", names(.))] %>% .[order(as.numeric(sub("spanning_", "", .)))]), - # Dynamically order columns hrc_spanning_xxx by their numeric suffix all_of(names(.)[grepl("^hrc_spanning_\\d+$", names(.))] %>% .[order(as.numeric(sub("hrc_spanning_", "", .)))]) - ) %>% as.data.frame() + ) %>% + as.data.frame() %>% + unique() # TODO come back to this, why where there duplicates in the first place + + return(dataframe_metadata) } + + diff --git a/man/build_spanning_based_on_hrc_indicator.Rd b/man/build_spanning_based_on_hrc_indicator.Rd new file mode 100644 index 0000000..140d51d --- /dev/null +++ b/man/build_spanning_based_on_hrc_indicator.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/identify_hrc_with_eq.R +\name{build_spanning_based_on_hrc_indicator} +\alias{build_spanning_based_on_hrc_indicator} +\title{Build a data frame of indicators without equations} +\usage{ +build_spanning_based_on_hrc_indicator(df_without_group, df_spannings) +} +\arguments{ +\item{df_without_group}{A data frame containing the rows of +\code{df_spannings_eq} that do not belong to any equation group +(\code{group} is \code{NA}). Must contain the following columns: +\code{table_name}, \code{field}, \code{hrc_field}, \code{indicator}, +and \code{hrc_indicator}.} + +\item{df_spannings}{A data frame derived from \code{df_metadata_long} with +renamed spanning and indicator variables. It is used as the base to which +the newly built rows are appended via \code{bind_rows}.} +} +\value{ +A data frame with one row per \code{table_name} for the non-\code{NA} +\code{hrc_indicator} rows, appended to \code{df_spannings} and sorted by +\code{table_name}. The returned columns are: +\describe{ +\item{table_name}{Name of the table.} +\item{field}{Last value of \code{field} within the group.} +\item{hrc_field}{Last value of \code{hrc_field} within the group.} +\item{spanning}{Uppercase \code{hrc_indicator} suffixed with \code{^h}.} +\item{hrc_spanning}{Last value of \code{hrc_indicator} within the group.} +\item{indicator}{Last value of \code{indicator} within the group.} +\item{hrc_indicator}{Last value of \code{hrc_indicator} within the group.} +} +} +\description{ +Internal helper that aggregates rows from \code{df_without_group} that have +a non-\code{NA} \code{hrc_indicator}, and appends them to \code{df_spannings}. +The result is used to represent response variables that are linked by a +hierarchy but not by any equation. +} +\keyword{internal} diff --git a/man/dataframe_result.Rd b/man/dataframe_result.Rd index 30f6a00..001ef5d 100644 --- a/man/dataframe_result.Rd +++ b/man/dataframe_result.Rd @@ -4,12 +4,21 @@ \alias{dataframe_result} \title{Combine List of Dataframes into a Single Dataframe with Cluster Identification} \usage{ -dataframe_result(list_independent_tables) +dataframe_result(list_independent_tables, list_hrc_identified) } \arguments{ \item{list_independent_tables}{A list of tibbles, typically the output of \code{grp_tab_in_cluster()} or \code{tab_to_treat()}. Each tibble contains metadata for tables grouped within a specific cluster.} + +\item{list_hrc_identified}{A list returned by the \code{identify_hrc} function. The first +element of the list must be a data frame containing the variables: +\itemize{ +\item \code{field}: A grouping variable. +\item \code{hrc_field}: The hierarchical counterpart of \code{field}. +\item \code{indicator}: A variable used to link tables. +\item \code{hrc_indicator}: The hierarchical counterpart of \code{indicator}. +}} } \value{ A single dataframe (\code{dfMetadata_to_treat}) with the following structure: diff --git a/man/regroup_tables.Rd b/man/regroup_tables.Rd new file mode 100644 index 0000000..78f6df3 --- /dev/null +++ b/man/regroup_tables.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/identify_hrc_with_eq.R +\name{regroup_tables} +\alias{regroup_tables} +\title{Regroup tables within a group (i.e. equation / group of linked equations) +based on spanning combination completeness} +\usage{ +regroup_tables(df_group, spanning_combination_group) +} +\arguments{ +\item{df_group}{A tibble containing the rows of a single group from +\code{df_with_group}. Must contain columns: \code{table_name}, +\code{spanning}, \code{side}, \code{var_mapped}, \code{indicator}, +\code{unit}, and \code{group}.} + +\item{spanning_combination_group}{A tibble produced by the +\code{spanning_combination_group} pipeline, containing one row per +(group, spanning_key) combination. Must contain columns: \code{group}, +\code{spanning_key}, and \code{all_combinations} (logical indicating +whether the spanning combination covers all sides of the equation).} +} +\value{ +A tibble with one row per (merged or solo) table cluster and +spanning, containing the following columns (among others): +\describe{ +\item{table_name}{Dot-separated list of merged table names (e.g. +\code{"T7.T9.T11"}) for complete combinations, or the original +table name for incomplete ones.} +\item{indicator}{The unit value shared across the merged tables.} +\item{initial_indicator}{The \code{var_mapped} value of the \code{total} +side, used to track the original indicator before merging.} +} +} +\description{ +For a given group of tables, this function identifies which tables cover all +sides of an equation (total, rhs1, rhs2, ...) for their spanning combination, +and which do not. Tables with complete combinations are merged into a single +row; tables with incomplete combinations are kept as standalone rows with +their original spannings. +} +\examples{ +\dontrun{ +list_groups <- split(df_with_group, df_with_group$group) + +df_eq_initial_spannings <- purrr::map(list_groups, function(df_group) { + regroup_tables(df_group, spanning_combination_group) +}) |> + purrr::compact() |> + dplyr::bind_rows() +} + +} +\keyword{internal} diff --git a/man/rtauargus-package.Rd b/man/rtauargus-package.Rd index 29e9789..5f1c96b 100644 --- a/man/rtauargus-package.Rd +++ b/man/rtauargus-package.Rd @@ -23,6 +23,7 @@ Useful links: Authors: \itemize{ + \item Julien Jamme \email{julien.jamme@insee.fr} \item Pierre-Yves Berrard \email{pierre-yves.berrard@insee.fr} \item Nathanaël Rastout \email{nathanael.rastout@insee.fr} \item Jeanne Pointet diff --git a/tests/testthat/test_analyse_metadata.R b/tests/testthat/test_analyse_metadata.R index a075c40..ab0f96e 100644 --- a/tests/testthat/test_analyse_metadata.R +++ b/tests/testthat/test_analyse_metadata.R @@ -162,10 +162,25 @@ test_that("hierarchies on indicators", { ##################################################### INDICATOR EQUATIONS CHECKS # all the spanning variables are taken into account when using equations on # indicators ------------------------------------------------------------------- +df_eq_lettuce_1 <- data.frame( + eq_name = c("eq1"), + eq_indicator = c("to_lettuce = to_batavia + to_arugula"), + unit = c("EUR"), + stringsAsFactors = FALSE +) + +df_eq_lettuce_2 <- data.frame( + eq_name = c("eq1","eq2"), + eq_indicator = c("to_lettuce = to_batavia + to_arugula", + "to_pizza = to_tomates + to_pate"), + unit = c("EUR","EUR"), + stringsAsFactors = FALSE +) + answer <- data.frame( cluster = c( - "france_entreprises_2023.hrc_lettuce", - "france_entreprises_2023.hrc_lettuce", + "france_entreprises_2023.EUR", + "france_entreprises_2023.EUR", "france_entreprises_2023.to_pizza", "france_entreprises_2023.to_pizza" ), @@ -176,26 +191,201 @@ answer <- data.frame( "T3.T4.T5.T6" ), field = rep("france_entreprises_2023", 4), - indicator = c("LETTUCE", "LETTUCE", "to_pizza", "to_pizza"), + indicator = c("to_lettuce", "to_lettuce", "to_pizza", "to_pizza"), spanning_1 = c("HRC_NAF", "HRC_NAF", "HRC_NUTS", "HRC_NAF"), spanning_2 = c("cj", "size", "size", "HRC_NUTS"), - spanning_3 = c("HRC_LETTUCE^h", "HRC_LETTUCE^h", NA, NA), + spanning_3 = c("EQ1^h", "EQ1^h", NA, NA), hrc_spanning_1 = c("hrc_naf", "hrc_naf", "hrc_nuts", "hrc_naf"), hrc_spanning_2 = c(NA, NA, NA, "hrc_nuts"), - hrc_spanning_3 = c("hrc_lettuce", "hrc_lettuce", NA, NA) + hrc_spanning_3 = c("hrc_EQ1.totcode.to_lettuce", "hrc_EQ1.totcode.to_lettuce", NA, NA) ) test_that("indicators equation", { - df_eq_ex <- data.frame( - eq_name = c("eq1"), - eq_indicator = c("ca_salades = ca_batavia + ca_mache"), - unit = c("EUR"), - stringsAsFactors = FALSE + expect_warning( + expect_equal( + analyse_metadata(df_metadata = metadata_pizza_lettuce,df_eq_indicator = df_eq_lettuce_1), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# All indicators in the same equation broken down by the same spanning variable ---- +answer <- data.frame( + cluster = c("france_entreprises_2023.EUR"), + table_name = c("T11.T7.T9"), + field = c("france_entreprises_2023"), + indicator = c("to_lettuce"), + spanning_1 = c("a10"), + spanning_2 = c("EQ1^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = c("hrc_EQ1.totcode.to_lettuce") +) + +test_that("meme_var_crois_1", { + meta <- metadata_pizza_lettuce[,c(1:7)] %>% filter(table_name %in% c("T7","T9","T11")) + meta$hrc_spanning_1 <- NA_character_ + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_1), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# All indicators in the same equation broken down by the same spanning variables ---- +answer <- data.frame( + cluster = c("france_entreprises_2023.EUR","france_entreprises_2023.EUR"), + table_name = c("T10.T12.T8","T11.T7.T9"), + field = c("france_entreprises_2023","france_entreprises_2023"), + indicator = c("to_lettuce","to_lettuce"), + spanning_1 = c("a10","a10"), + spanning_2 = c("cj","size"), + spanning_3 = c("EQ1^h","EQ1^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = NA_character_, + hrc_spanning_3 = c("hrc_EQ1.totcode.to_lettuce","hrc_EQ1.totcode.to_lettuce") +) + +test_that("meme_var_crois_2", { + meta <- metadata_pizza_lettuce[c(7:12),] + meta$hrc_spanning_1 <- NA_character_ + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_1), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# Two equations, all indiicators broken down by the same spanning variable ----- +answer <- data.frame( + cluster = c("france_entreprises_2023.EUR","france_entreprises_2023.EUR"), + table_name = c("T10.T11.T12.T7.T8.T9","T4.T5.T6"), + field = c("france_entreprises_2023","france_entreprises_2023"), + indicator = c("to_lettuce","to_pizza"), + spanning_1 = c("a10","a10"), + spanning_2 = c("EQ1^h","EQ2^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = c("hrc_EQ1.totcode.to_lettuce","hrc_EQ2.totcode.to_pizza") +) + +test_that("meme_var_crois_1_deux_eq", { + meta <- metadata_pizza_lettuce[c(4:12),c(1:7)] + meta$indicator <- c("to_pizza","to_tomates","to_pate","to_batavia","to_batavia","to_arugula","to_arugula","to_lettuce","to_lettuce") + meta <- meta %>% mutate(spanning_1 = "a10",hrc_spanning_1 = NA_character_) + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_2), + answer + ), + "hrc_indicator column will be ignored" ) +} +) + +# Two equations, all indicators in each equations are broken down by the same +# spanning variables ----------------------------------------------------------- +answer <- data.frame( + cluster = rep("france_entreprises_2023.EUR"), + table_name = c("T10.T12.T8","T11.T7.T9","T4.T5.T6"), + field = rep("france_entreprises_2023"), + indicator = c("to_lettuce","to_lettuce","to_pizza"), + spanning_1 = rep("a10"), + spanning_2 = c("cj","size","size"), + spanning_3 = c("EQ1^h","EQ1^h","EQ2^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = NA_character_, + hrc_spanning_3 = c("hrc_EQ1.totcode.to_lettuce","hrc_EQ1.totcode.to_lettuce","hrc_EQ2.totcode.to_pizza") +) + +test_that("meme_var_crois_2_deux_eq", { + meta <- metadata_pizza_lettuce[c(4:12),] + meta$indicator <- c("to_pizza","to_tomates","to_pate","to_batavia","to_batavia","to_arugula","to_arugula","to_lettuce","to_lettuce") + meta <- meta %>% mutate(spanning_1 = "a10", + hrc_spanning_1 = NA_character_, + spanning_2 = c("size","size","size","size","cj","size","cj","size","cj"), + hrc_spanning_2 = NA_character_) + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_2), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# One equation, but the indicators are not broken down by the same spanning +# variables (only one spanning variable by table) ------------------------------ +answer <- data.frame( + cluster = rep("france_entreprises_2023.EUR"), + table_name = c("T11","T7.T9"), + field = rep("france_entreprises_2023"), + indicator = c("to_lettuce","EUR"), + spanning_1 = c("cj","a10"), + spanning_2 = c("EQ1^h","EQ1^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = c("hrc_EQ1.totcode.to_lettuce","hrc_EQ1.totcode.to_lettuce") +) + + +test_that("pas_meme_var_crois_1", { + meta <- metadata_pizza_lettuce[,c(1:7)] %>% + filter(table_name %in% c("T7","T9","T11")) %>% + mutate(spanning_1 = c("a10","a10","cj")) + meta$hrc_spanning_1 <- NA_character_ + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_1), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# One equation, but the indicators are not broken down by the same spanning +# variables (one or two spanning variable by table) ------------------------------ +answer <- data.frame( + cluster = rep("france_entreprises_2023.EUR"), + table_name = c("T1.T2","T1.T2.T3"), + field = rep("france_entreprises_2023"), + indicator = c("EUR","EUR"), + spanning_1 = c("a10","a10"), + spanning_2 = c("EQ1^h","size"), + spanning_3 = c(NA,"EQ1^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = c("hrc_EQ1.totcode.to_lettuce",NA), + hrc_spanning_3 = c(NA,"hrc_EQ1.totcode.to_lettuce") +) + +test_that("pas_meme_var_crois_2", { + meta <- metadata_pizza_lettuce %>% filter(table_name %in% c("T7","T9","T11")) + meta$spanning_2 <- c(NA,NA,"size") + meta$hrc_spanning_1 <- NA_character_ + meta$hrc_indicator <- NA_character_ + meta$table_name <- c("T1","T2","T3") + expect_warning( expect_equal( - analyse_metadata(df_metadata = metadata_pizza_lettuce,df_eq_indicator = df_eq_ex), + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_1), answer ), "hrc_indicator column will be ignored" diff --git a/vignettes/auto_metadata.Rmd b/vignettes/auto_metadata.Rmd index 42e101f..229215e 100644 --- a/vignettes/auto_metadata.Rmd +++ b/vignettes/auto_metadata.Rmd @@ -141,16 +141,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS -## hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## 1 hrc_naf hrc_lettuce -## 2 hrc_naf hrc_lettuce -## 3 hrc_nuts -## 4 hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` For the 12 tables to be published, it is sufficient to protect 4 tables. These tables are distributed across two different clusters. Therefore, `tab_multi_manager()` needs to be called twice. @@ -164,8 +164,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" -## [6] "grp_tab_in_clusters" "tab_to_treat" "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" +## [7] "tab_to_treat" "df_tab_to_treat" ``` One finds the dataframe with the cluster indicator `df_tab_to_treat`. The result is the same but in list format: each element of the list is an independent cluster `tab_to_treat`. Additionally, the 6 steps of the analysis are included. @@ -232,20 +232,21 @@ Here, there is no hierarchical link between `SAL` (employees of active companies ``` r # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template ``` ``` -## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 -## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 +## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 hrc_field +## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA ``` Next, this dataframe is used as input for the analysis function. @@ -254,39 +255,22 @@ Next, this dataframe is used as input for the analysis function. ``` r # Analyse complète, avec les étapes detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # visualisation du résultat de l'analyse cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS -## hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## 1 hrc_naf hrc_lettuce -## 2 hrc_naf hrc_lettuce -## 3 hrc_nuts -## 4 hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 +## 1 2021.SAL_DTH table_2021_SAL_DTH_1 2021 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 2 2021.SAL_DTH table_2021_SAL_DTH_2 2021 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 3 2022.SAL table_2022_SAL_1 2022 SAL HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 4 2022.SAL table_2022_SAL_2 2022 SAL HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 5 2022.SAL_DTH table_2022_SAL_DTH_1 2022 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 6 2022.SAL_DTH table_2022_SAL_DTH_2 2022 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 ``` Ultimately, there are 6 tables to process in 3 different clusters. In other words, `tab_multi_manager()` will need to be called three times. @@ -340,17 +324,6 @@ detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, ## the hrc_indicator column will be ignored. ``` -``` -## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0. -## ℹ Please use `reframe()` instead. -## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust -## accordingly. -## ℹ The deprecated feature was likely used in the rtauargus package. -## Please report the issue at . -## This warning is displayed once per session. -## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated. -``` - ``` r # Simplified output, only the data frame with the cluster indicator cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) @@ -368,62 +341,29 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS -## hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## 1 hrc_naf hrc_lettuce -## 2 hrc_naf hrc_lettuce -## 3 hrc_nuts -## 4 hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` # Going Further: Visualizing Inclusions -The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. +The create_edges step in the metadata analysis identifies tables included within other tables. For example, `T1` is included in `T2`. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. ``` r library(rtauargus) library(igraph) -``` - -``` -## -## Attaching package: 'igraph' -``` - -``` -## The following objects are masked from 'package:dplyr': -## -## as_data_frame, groups, union -``` - -``` -## The following objects are masked from 'package:stats': -## -## decompose, spectrum -``` - -``` -## The following object is masked from 'package:base': -## -## union -``` - -``` r library(visNetwork) -``` -``` -## Error in `library()`: -## ! there is no package called 'visNetwork' -``` - -``` r graph_links_tab <- function(list_desc_links){ list_desc_links %>% purrr::imap(function(ss_dem,i){ if(!is.null(ss_dem)){ @@ -446,10 +386,8 @@ graph_links_tab(detailed_analysis$create_edges) ``` ``` -## Error in `map2()`: -## ℹ In index: 1. -## ℹ With name: france_entreprises_2023.hrc_lettuce. -## Caused by error in `visOptions()`: -## ! could not find function "visOptions" +## $france_entreprises_2023.hrc_lettuce +## +## $france_entreprises_2023.to_pizza ``` diff --git a/vignettes/auto_metadata.Rmd.orig b/vignettes/auto_metadata.Rmd.orig index e94ebf1..afba68a 100644 --- a/vignettes/auto_metadata.Rmd.orig +++ b/vignettes/auto_metadata.Rmd.orig @@ -16,7 +16,7 @@ vignette: > knitr::opts_chunk$set(echo = TRUE) ``` -```{r message = FALSE} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(dplyr) ``` @@ -150,7 +150,8 @@ Here, there is no hierarchical link between `SAL` (employees of active companies ```{r} # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template @@ -215,9 +216,9 @@ cluster_id_dataframe # Going Further: Visualizing Inclusions -The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. +The create_edges step in the metadata analysis identifies tables included within other tables. For example, `T1` is included in `T2`. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. -```{r} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(igraph) library(visNetwork) diff --git a/vignettes/auto_metadata_fr.Rmd b/vignettes/auto_metadata_fr.Rmd index d98f6dd..44ae776 100644 --- a/vignettes/auto_metadata_fr.Rmd +++ b/vignettes/auto_metadata_fr.Rmd @@ -6,10 +6,10 @@ output: toc: true toc_depth: 3 vignette: > -%\VignetteIndexEntry{Analyse automatique des métadonnées} + %\VignetteIndexEntry{Automatic analysis of metadata} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} - --- +--- @@ -140,16 +140,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf -## hrc_spanning_2 hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` Pour les 12 tableaux à publier il suffit de protéger 4 tableaux. Ces tableaux sont repartis dans deux clusters différents. Il faudra donc faire appel deux fois à `tab_multi_manager()`. @@ -231,20 +231,21 @@ Ici, il n'y a pas de lien hiérarchique entre `SAL` (effectifs salariés des ent ``` r # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template ``` ``` -## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 -## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 +## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 hrc_field +## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA ``` Ensuite, on utilise ce dataframe en input de la fonction d'analyse. @@ -253,37 +254,22 @@ Ensuite, on utilise ce dataframe en input de la fonction d'analyse. ``` r # Analyse complète, avec les étapes detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) -``` - -``` -## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` -``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) -``` -``` -## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # visualisation du résultat de l'analyse cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf -## hrc_spanning_2 hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 +## 1 2021.SAL_DTH table_2021_SAL_DTH_1 2021 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 2 2021.SAL_DTH table_2021_SAL_DTH_2 2021 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 3 2022.SAL table_2022_SAL_1 2022 SAL HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 4 2022.SAL table_2022_SAL_2 2022 SAL HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 5 2022.SAL_DTH table_2022_SAL_DTH_1 2022 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 6 2022.SAL_DTH table_2022_SAL_DTH_2 2022 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 ``` Finalement, il y a 6 tableaux à traiter dans 3 clusters différents. Autrement dit, il faudra faire trois fois appel à `tab_multi_manager()`. @@ -336,10 +322,6 @@ detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, ## the hrc_indicator column will be ignored. ``` -``` -## Error in components(g_full): impossible de trouver la fonction "components" -``` - ``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) @@ -358,51 +340,25 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf -## hrc_spanning_2 hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` # Pour aller plus loin : visualiser les inclusions -L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. +L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, `T1` est inclus dans `T2`. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. ``` r library(rtauargus) library(igraph) -``` - -``` -## -## Attachement du package : 'igraph' -``` - -``` -## Les objets suivants sont masqués depuis 'package:dplyr': -## -## as_data_frame, groups, union -``` - -``` -## Les objets suivants sont masqués depuis 'package:stats': -## -## decompose, spectrum -``` - -``` -## L'objet suivant est masqué depuis 'package:base': -## -## union -``` - -``` r library(visNetwork) graph_links_tab <- function(list_desc_links){ diff --git a/vignettes/auto_metadata_fr.Rmd.orig b/vignettes/auto_metadata_fr.Rmd.orig index f0c8159..b61d857 100644 --- a/vignettes/auto_metadata_fr.Rmd.orig +++ b/vignettes/auto_metadata_fr.Rmd.orig @@ -6,17 +6,17 @@ output: toc: true toc_depth: 3 vignette: > -%\VignetteIndexEntry{Analyse automatique des métadonnées} + %\VignetteIndexEntry{Automatic analysis of metadata} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} - --- +--- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` -```{r message = FALSE} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(dplyr) ``` @@ -149,7 +149,8 @@ Ici, il n'y a pas de lien hiérarchique entre `SAL` (effectifs salariés des ent ```{r} # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template @@ -212,9 +213,9 @@ cluster_id_dataframe ``` # Pour aller plus loin : visualiser les inclusions -L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. +L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, `T1` est inclus dans `T2`. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. -```{r} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(igraph) library(visNetwork)