From c8fc22fc7b544562baa6f9719aa7af8fed2cf00b Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Mon, 11 May 2026 16:07:34 +0200 Subject: [PATCH 01/38] =?UTF-8?q?fix:=20dans=20le=20r=C3=A9sultat=20on=20a?= =?UTF-8?q?vait=20un=20tableau=20ventil=C3=A9=20plusieurs=20fois=20par=20l?= =?UTF-8?q?a=20m=C3=AAme=20variable=20de=20croisement.=20Cela=20=C3=A9tait?= =?UTF-8?q?=20d=C3=BB=20au=20fait=20que=20dans=20un=20m=C3=AAme=20groupe?= =?UTF-8?q?=20(var=20group)=20des=20variables=20de=20croisement=20sont=20r?= =?UTF-8?q?=C3=A9p=C3=A9t=C3=A9es.=20C'est=20attendu=20puisque=20les=20ind?= =?UTF-8?q?icateurs=20d'une=20m=C3=AAme=20=C3=A9quation=20sont=20normaleme?= =?UTF-8?q?nt=20ventil=C3=A9s=20par=20les=20m=C3=AAmes=20variables=20de=20?= =?UTF-8?q?croisement.=20L'ajout=20du=20distinct()=20permet=20de=20garder?= =?UTF-8?q?=20dans=20chaque=20groupe=20qu'une=20seule=20fois=20la=20variab?= =?UTF-8?q?le=20de=20croisement.=20Autrement=20dit,=20si=20une=20=C3=A9qua?= =?UTF-8?q?tion=20A=20=3D=20B=20+=20C=20d=C3=A9finit=20le=20group=201=20et?= =?UTF-8?q?=20que=20A,=20B=20et=20C=20sont=20ventil=C3=A9es=20par=20nb=5Fe?= =?UTF-8?q?mpl=20on=20n'a=20plus=20nb=5Fempl=20r=C3=A9p=C3=A9t=C3=A9=203?= =?UTF-8?q?=20fois=20mais=20seulement=201=20fois?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index ff22b0f..f9ecc67 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -200,7 +200,8 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ hrc_spanning = hrc_spanning, indicator = last(unit), hrc_indicator = last(hrc_indicator) - ) %>% unique() + ) %>% + distinct(group, spanning, hrc_spanning, .keep_all = TRUE) # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. 
# Each equation name is transformed into its uppercase form with a "^h" suffix, From e141a7d4ad451790a684cbc6ed7d3b233df9c06d Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Mon, 11 May 2026 16:35:09 +0200 Subject: [PATCH 02/38] =?UTF-8?q?fix:=20m=C3=AAme=20chose=20que=20pour=20l?= =?UTF-8?q?e=20commit=20pr=C3=A9c=C3=A9dent=20(distinct()=20sur=20df=5Feq?= =?UTF-8?q?=5Finital=5Fspannings)=20mais=20cette=20fois-ci=20sur=20la=20pa?= =?UTF-8?q?rtie=20variables=20de=20croisement=20sur=20les=20indicateurs=20?= =?UTF-8?q?qui=20sont=20dans=20des=20=C3=A9quations=20(df=5Feq=5Findicator?= =?UTF-8?q?=5Fspannings)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index f9ecc67..83dd55c 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -201,7 +201,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ indicator = last(unit), hrc_indicator = last(hrc_indicator) ) %>% - distinct(group, spanning, hrc_spanning, .keep_all = TRUE) + dplyr::distinct(group, spanning, hrc_spanning, .keep_all = TRUE) # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. 
# Each equation name is transformed into its uppercase form with a "^h" suffix, @@ -226,7 +226,8 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ indicator = last(unit), hrc_indicator = last(hrc_indicator), .groups = "drop" - ) + ) %>% + dplyr::distinct(group, spanning, hrc_spanning, .keep_all = TRUE) # 'df_indicators' combines both initial and indicator spanning information # into a single harmonized dataset, keeping key structural columns From bb37d21ed4164b501860da940744515a82081bc0 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 28 May 2026 16:50:04 +0200 Subject: [PATCH 03/38] docs: change the header to fix unknown title + hide warnings and correct XXXXX --- vignettes/auto_metadata_fr.Rmd | 106 ++++++++++------------------ vignettes/auto_metadata_fr.Rmd.orig | 10 +-- 2 files changed, 44 insertions(+), 72 deletions(-) diff --git a/vignettes/auto_metadata_fr.Rmd b/vignettes/auto_metadata_fr.Rmd index d98f6dd..8ffc4b7 100644 --- a/vignettes/auto_metadata_fr.Rmd +++ b/vignettes/auto_metadata_fr.Rmd @@ -6,10 +6,10 @@ output: toc: true toc_depth: 3 vignette: > -%\VignetteIndexEntry{Analyse automatique des métadonnées} + %\VignetteIndexEntry{Automatic analysis of metadata} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} - --- +--- @@ -140,16 +140,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf -## hrc_spanning_2 hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 hrc_nuts +## cluster table_name field 
indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` Pour les 12 tableaux à publier il suffit de protéger 4 tableaux. Ces tableaux sont repartis dans deux clusters différents. Il faudra donc faire appel deux fois à `tab_multi_manager()`. @@ -163,8 +163,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" -## [7] "tab_to_treat" "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" "tab_to_treat" +## [8] "df_tab_to_treat" ``` On retrouve le dataframe avec l'indicatrice du cluster `df_tab_to_treat`. On a le même résultat mais en format liste : chaque élément de la sous-liste `tab_to_treat` est un cluster indépendant. Les autres éléments de la liste sont les étapes de l'analyse. @@ -256,7 +256,8 @@ detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) ``` ``` -## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. +## Error in `check_column_names()`: +## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. 
``` ``` r @@ -265,7 +266,8 @@ cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) ``` ``` -## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. +## Error in `check_column_names()`: +## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. ``` ``` r @@ -274,16 +276,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf -## hrc_spanning_2 hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` Finalement, il y a 6 tableaux à traiter dans 3 clusters différents. Autrement dit, il faudra faire trois fois appel à `tab_multi_manager()`. 
@@ -336,10 +338,6 @@ detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, ## the hrc_indicator column will be ignored. ``` -``` -## Error in components(g_full): impossible de trouver la fonction "components" -``` - ``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) @@ -358,51 +356,25 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf -## hrc_spanning_2 hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` # Pour aller plus loin : visualiser les inclusions -L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. 
Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. +L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, `T1` est inclus dans `T2`. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. ``` r library(rtauargus) library(igraph) -``` - -``` -## -## Attachement du package : 'igraph' -``` - -``` -## Les objets suivants sont masqués depuis 'package:dplyr': -## -## as_data_frame, groups, union -``` - -``` -## Les objets suivants sont masqués depuis 'package:stats': -## -## decompose, spectrum -``` - -``` -## L'objet suivant est masqué depuis 'package:base': -## -## union -``` - -``` r library(visNetwork) graph_links_tab <- function(list_desc_links){ diff --git a/vignettes/auto_metadata_fr.Rmd.orig b/vignettes/auto_metadata_fr.Rmd.orig index f0c8159..ab17a82 100644 --- a/vignettes/auto_metadata_fr.Rmd.orig +++ b/vignettes/auto_metadata_fr.Rmd.orig @@ -6,17 +6,17 @@ output: toc: true toc_depth: 3 vignette: > -%\VignetteIndexEntry{Analyse automatique des métadonnées} + %\VignetteIndexEntry{Automatic analysis of metadata} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} - --- +--- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` -```{r message = FALSE} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(dplyr) ``` @@ -212,9 +212,9 @@ cluster_id_dataframe ``` # Pour aller plus loin : visualiser les inclusions -L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. +L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, `T1` est inclus dans `T2`. 
Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. -```{r} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(igraph) library(visNetwork) From dc22a03af4318d56d9764ebde38b54727987ba52 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 28 May 2026 16:53:34 +0200 Subject: [PATCH 04/38] =?UTF-8?q?fix:=20ajout=20hrc=5Ffield=20=3D=20NA=20p?= =?UTF-8?q?our=20=C3=A9viter=20erreur?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vignettes/auto_metadata_fr.Rmd | 52 ++++++++++------------------- vignettes/auto_metadata_fr.Rmd.orig | 3 +- 2 files changed, 20 insertions(+), 35 deletions(-) diff --git a/vignettes/auto_metadata_fr.Rmd b/vignettes/auto_metadata_fr.Rmd index 8ffc4b7..44ae776 100644 --- a/vignettes/auto_metadata_fr.Rmd +++ b/vignettes/auto_metadata_fr.Rmd @@ -163,8 +163,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" "tab_to_treat" -## [8] "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" +## [7] "tab_to_treat" "df_tab_to_treat" ``` On retrouve le dataframe avec l'indicatrice du cluster `df_tab_to_treat`. On a le même résultat mais en format liste : chaque élément de la sous-liste `tab_to_treat` est un cluster indépendant. Les autres éléments de la liste sont les étapes de l'analyse. 
@@ -231,20 +231,21 @@ Ici, il n'y a pas de lien hiérarchique entre `SAL` (effectifs salariés des ent ``` r # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template ``` ``` -## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 -## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 +## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 hrc_field +## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA ``` Ensuite, on utilise ce dataframe en input de la fonction d'analyse. @@ -253,39 +254,22 @@ Ensuite, on utilise ce dataframe en input de la fonction d'analyse. 
``` r # Analyse complète, avec les étapes detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # visualisation du résultat de l'analyse cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts -## hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 +## cluster table_name field indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 +## 1 2021.SAL_DTH table_2021_SAL_DTH_1 2021 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 2 2021.SAL_DTH table_2021_SAL_DTH_2 2021 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 3 2022.SAL table_2022_SAL_1 2022 SAL HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 4 2022.SAL table_2022_SAL_2 2022 SAL HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 5 2022.SAL_DTH table_2022_SAL_DTH_1 2022 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 
hrc_legal_form_3 +## 6 2022.SAL_DTH table_2022_SAL_DTH_2 2022 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 ``` Finalement, il y a 6 tableaux à traiter dans 3 clusters différents. Autrement dit, il faudra faire trois fois appel à `tab_multi_manager()`. diff --git a/vignettes/auto_metadata_fr.Rmd.orig b/vignettes/auto_metadata_fr.Rmd.orig index ab17a82..b61d857 100644 --- a/vignettes/auto_metadata_fr.Rmd.orig +++ b/vignettes/auto_metadata_fr.Rmd.orig @@ -149,7 +149,8 @@ Ici, il n'y a pas de lien hiérarchique entre `SAL` (effectifs salariés des ent ```{r} # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template From 9bca9dc60cb4fea6365e3befe0800042a3f978cb Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 28 May 2026 16:54:52 +0200 Subject: [PATCH 05/38] =?UTF-8?q?docs:=20m=C3=AAme=20fix=20que=20pour=20au?= =?UTF-8?q?to=5Fmetadata=5Ffr,=20sans=20le=20pbm=20de=20header?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vignettes/auto_metadata.Rmd | 146 +++++++++---------------------- vignettes/auto_metadata.Rmd.orig | 9 +- 2 files changed, 47 insertions(+), 108 deletions(-) diff --git a/vignettes/auto_metadata.Rmd b/vignettes/auto_metadata.Rmd index 42e101f..229215e 100644 --- a/vignettes/auto_metadata.Rmd +++ b/vignettes/auto_metadata.Rmd @@ -141,16 +141,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size -## 4 
france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS -## hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## 1 hrc_naf hrc_lettuce -## 2 hrc_naf hrc_lettuce -## 3 hrc_nuts -## 4 hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` For the 12 tables to be published, it is sufficient to protect 4 tables. These tables are distributed across two different clusters. Therefore, `tab_multi_manager()` needs to be called twice. @@ -164,8 +164,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" -## [6] "grp_tab_in_clusters" "tab_to_treat" "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" +## [7] "tab_to_treat" "df_tab_to_treat" ``` One finds the dataframe with the cluster indicator `df_tab_to_treat`. The result is the same but in list format: each element of the list is an independent cluster `tab_to_treat`. Additionally, the 6 steps of the analysis are included. 
@@ -232,20 +232,21 @@ Here, there is no hierarchical link between `SAL` (employees of active companies ``` r # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template ``` ``` -## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 -## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 +## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 hrc_field +## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA ``` Next, this dataframe is used as input for the analysis function. @@ -254,39 +255,22 @@ Next, this dataframe is used as input for the analysis function. 
``` r # Analyse complète, avec les étapes detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # visualisation du résultat de l'analyse cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS -## hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## 1 hrc_naf hrc_lettuce -## 2 hrc_naf hrc_lettuce -## 3 hrc_nuts -## 4 hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 +## 1 2021.SAL_DTH table_2021_SAL_DTH_1 2021 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 2 2021.SAL_DTH table_2021_SAL_DTH_2 2021 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 3 2022.SAL table_2022_SAL_1 2022 SAL HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 4 2022.SAL table_2022_SAL_2 2022 SAL HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 5 2022.SAL_DTH table_2022_SAL_DTH_1 2022 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 
hrc_legal_form_3 +## 6 2022.SAL_DTH table_2022_SAL_DTH_2 2022 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 ``` Ultimately, there are 6 tables to process in 3 different clusters. In other words, `tab_multi_manager()` will need to be called three times. @@ -340,17 +324,6 @@ detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, ## the hrc_indicator column will be ignored. ``` -``` -## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0. -## ℹ Please use `reframe()` instead. -## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust -## accordingly. -## ℹ The deprecated feature was likely used in the rtauargus package. -## Please report the issue at . -## This warning is displayed once per session. -## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated. -``` - ``` r # Simplified output, only the data frame with the cluster indicator cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) @@ -368,62 +341,29 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS -## hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## 1 hrc_naf hrc_lettuce -## 2 hrc_naf hrc_lettuce -## 3 hrc_nuts -## 4 hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj 
HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` # Going Further: Visualizing Inclusions -The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. +The create_edges step in the metadata analysis identifies tables included within other tables. For example, `T1` is included in `T2`. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. ``` r library(rtauargus) library(igraph) -``` - -``` -## -## Attaching package: 'igraph' -``` - -``` -## The following objects are masked from 'package:dplyr': -## -## as_data_frame, groups, union -``` - -``` -## The following objects are masked from 'package:stats': -## -## decompose, spectrum -``` - -``` -## The following object is masked from 'package:base': -## -## union -``` - -``` r library(visNetwork) -``` -``` -## Error in `library()`: -## ! there is no package called 'visNetwork' -``` - -``` r graph_links_tab <- function(list_desc_links){ list_desc_links %>% purrr::imap(function(ss_dem,i){ if(!is.null(ss_dem)){ @@ -446,10 +386,8 @@ graph_links_tab(detailed_analysis$create_edges) ``` ``` -## Error in `map2()`: -## ℹ In index: 1. -## ℹ With name: france_entreprises_2023.hrc_lettuce. -## Caused by error in `visOptions()`: -## ! 
could not find function "visOptions" +## $france_entreprises_2023.hrc_lettuce +## +## $france_entreprises_2023.to_pizza ``` diff --git a/vignettes/auto_metadata.Rmd.orig b/vignettes/auto_metadata.Rmd.orig index e94ebf1..afba68a 100644 --- a/vignettes/auto_metadata.Rmd.orig +++ b/vignettes/auto_metadata.Rmd.orig @@ -16,7 +16,7 @@ vignette: > knitr::opts_chunk$set(echo = TRUE) ``` -```{r message = FALSE} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(dplyr) ``` @@ -150,7 +150,8 @@ Here, there is no hierarchical link between `SAL` (employees of active companies ```{r} # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template @@ -215,9 +216,9 @@ cluster_id_dataframe # Going Further: Visualizing Inclusions -The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. +The create_edges step in the metadata analysis identifies tables included within other tables. For example, `T1` is included in `T2`. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. 
-```{r} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(igraph) library(visNetwork) From e19c978d5ab2722d03e469164ad742cddbe846e4 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Mon, 11 May 2026 16:07:34 +0200 Subject: [PATCH 06/38] =?UTF-8?q?fix:=20dans=20le=20r=C3=A9sultat=20on=20a?= =?UTF-8?q?vait=20un=20tableau=20ventil=C3=A9=20plusieurs=20fois=20par=20l?= =?UTF-8?q?a=20m=C3=AAme=20variable=20de=20croisement.=20Cela=20=C3=A9tait?= =?UTF-8?q?=20d=C3=BB=20au=20fait=20que=20dans=20un=20m=C3=AAme=20groupe?= =?UTF-8?q?=20(var=20group)=20des=20variables=20de=20croisement=20sont=20r?= =?UTF-8?q?=C3=A9p=C3=A9t=C3=A9es.=20C'est=20attendu=20puisque=20les=20ind?= =?UTF-8?q?icateurs=20d'une=20m=C3=AAme=20=C3=A9quation=20sont=20normaleme?= =?UTF-8?q?nt=20ventil=C3=A9s=20par=20les=20m=C3=AAmes=20variables=20de=20?= =?UTF-8?q?croisement.=20L'ajout=20du=20distinct()=20permet=20de=20garder?= =?UTF-8?q?=20dans=20chaque=20groupe=20qu'une=20seule=20fois=20la=20variab?= =?UTF-8?q?le=20de=20croisement.=20Autrement=20dit,=20si=20une=20=C3=A9qua?= =?UTF-8?q?tion=20A=20=3D=20B=20+=20C=20d=C3=A9finit=20le=20group=201=20et?= =?UTF-8?q?=20que=20A,=20B=20et=20C=20sont=20ventil=C3=A9es=20par=20nb=5Fe?= =?UTF-8?q?mpl=20on=20n'a=20plus=20nb=5Fempl=20r=C3=A9p=C3=A9t=C3=A9=203?= =?UTF-8?q?=20fois=20mais=20seulement=201=20fois?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index ff22b0f..f9ecc67 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -200,7 +200,8 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ hrc_spanning = hrc_spanning, indicator = last(unit), hrc_indicator = last(hrc_indicator) - ) %>% unique() + ) %>% + distinct(group, spanning, hrc_spanning, .keep_all = TRUE) # 'df_eq_indicator_spannings' defines the spanning information 
for equation indicators. # Each equation name is transformed into its uppercase form with a "^h" suffix, From f9cb467745a502a9ebb04f0443869302182ab25e Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Mon, 11 May 2026 16:35:09 +0200 Subject: [PATCH 07/38] =?UTF-8?q?fix:=20m=C3=AAme=20chose=20que=20pour=20l?= =?UTF-8?q?e=20commit=20pr=C3=A9c=C3=A9dent=20(distinct()=20sur=20df=5Feq?= =?UTF-8?q?=5Finital=5Fspannings)=20mais=20cette=20fois-ci=20sur=20la=20pa?= =?UTF-8?q?rtie=20variables=20de=20croisement=20sur=20les=20indicateurs=20?= =?UTF-8?q?qui=20sont=20dans=20des=20=C3=A9quations=20(df=5Feq=5Findicator?= =?UTF-8?q?=5Fspannings)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index f9ecc67..83dd55c 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -201,7 +201,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ indicator = last(unit), hrc_indicator = last(hrc_indicator) ) %>% - distinct(group, spanning, hrc_spanning, .keep_all = TRUE) + dplyr::distinct(group, spanning, hrc_spanning, .keep_all = TRUE) # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. 
# Each equation name is transformed into its uppercase form with a "^h" suffix, @@ -226,7 +226,8 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ indicator = last(unit), hrc_indicator = last(hrc_indicator), .groups = "drop" - ) + ) %>% + dplyr::distinct(group, spanning, hrc_spanning, .keep_all = TRUE) # 'df_indicators' combines both initial and indicator spanning information # into a single harmonized dataset, keeping key structural columns From 16efce4d509b16e90184959c9d52429efa2f0a23 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 28 May 2026 16:50:04 +0200 Subject: [PATCH 08/38] docs: change the header to fix unknown title + hide warnings and correct XXXXX --- vignettes/auto_metadata_fr.Rmd | 106 ++++++++++------------------ vignettes/auto_metadata_fr.Rmd.orig | 10 +-- 2 files changed, 44 insertions(+), 72 deletions(-) diff --git a/vignettes/auto_metadata_fr.Rmd b/vignettes/auto_metadata_fr.Rmd index d98f6dd..8ffc4b7 100644 --- a/vignettes/auto_metadata_fr.Rmd +++ b/vignettes/auto_metadata_fr.Rmd @@ -6,10 +6,10 @@ output: toc: true toc_depth: 3 vignette: > -%\VignetteIndexEntry{Analyse automatique des métadonnées} + %\VignetteIndexEntry{Automatic analysis of metadata} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} - --- +--- @@ -140,16 +140,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf -## hrc_spanning_2 hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 hrc_nuts +## cluster table_name field 
indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` Pour les 12 tableaux à publier il suffit de protéger 4 tableaux. Ces tableaux sont repartis dans deux clusters différents. Il faudra donc faire appel deux fois à `tab_multi_manager()`. @@ -163,8 +163,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" -## [7] "tab_to_treat" "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" "tab_to_treat" +## [8] "df_tab_to_treat" ``` On retrouve le dataframe avec l'indicatrice du cluster `df_tab_to_treat`. On a le même résultat mais en format liste : chaque élément de la sous-liste `tab_to_treat` est un cluster indépendant. Les autres éléments de la liste sont les étapes de l'analyse. @@ -256,7 +256,8 @@ detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) ``` ``` -## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. +## Error in `check_column_names()`: +## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. 
``` ``` r @@ -265,7 +266,8 @@ cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) ``` ``` -## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. +## Error in `check_column_names()`: +## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. ``` ``` r @@ -274,16 +276,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf -## hrc_spanning_2 hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` Finalement, il y a 6 tableaux à traiter dans 3 clusters différents. Autrement dit, il faudra faire trois fois appel à `tab_multi_manager()`. 
@@ -336,10 +338,6 @@ detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, ## the hrc_indicator column will be ignored. ``` -``` -## Error in components(g_full): impossible de trouver la fonction "components" -``` - ``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) @@ -358,51 +356,25 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf -## hrc_spanning_2 hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` # Pour aller plus loin : visualiser les inclusions -L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. 
Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. +L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, `T1` est inclus dans `T2`. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. ``` r library(rtauargus) library(igraph) -``` - -``` -## -## Attachement du package : 'igraph' -``` - -``` -## Les objets suivants sont masqués depuis 'package:dplyr': -## -## as_data_frame, groups, union -``` - -``` -## Les objets suivants sont masqués depuis 'package:stats': -## -## decompose, spectrum -``` - -``` -## L'objet suivant est masqué depuis 'package:base': -## -## union -``` - -``` r library(visNetwork) graph_links_tab <- function(list_desc_links){ diff --git a/vignettes/auto_metadata_fr.Rmd.orig b/vignettes/auto_metadata_fr.Rmd.orig index f0c8159..ab17a82 100644 --- a/vignettes/auto_metadata_fr.Rmd.orig +++ b/vignettes/auto_metadata_fr.Rmd.orig @@ -6,17 +6,17 @@ output: toc: true toc_depth: 3 vignette: > -%\VignetteIndexEntry{Analyse automatique des métadonnées} + %\VignetteIndexEntry{Automatic analysis of metadata} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} - --- +--- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` -```{r message = FALSE} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(dplyr) ``` @@ -212,9 +212,9 @@ cluster_id_dataframe ``` # Pour aller plus loin : visualiser les inclusions -L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. +L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, `T1` est inclus dans `T2`. 
Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. -```{r} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(igraph) library(visNetwork) From 32137eb8bc2cff4cdc2deb37b9631c16b766e0f1 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 28 May 2026 16:53:34 +0200 Subject: [PATCH 09/38] =?UTF-8?q?fix:=20ajout=20hrc=5Ffield=20=3D=20NA=20p?= =?UTF-8?q?our=20=C3=A9viter=20erreur?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vignettes/auto_metadata_fr.Rmd | 52 ++++++++++------------------- vignettes/auto_metadata_fr.Rmd.orig | 3 +- 2 files changed, 20 insertions(+), 35 deletions(-) diff --git a/vignettes/auto_metadata_fr.Rmd b/vignettes/auto_metadata_fr.Rmd index 8ffc4b7..44ae776 100644 --- a/vignettes/auto_metadata_fr.Rmd +++ b/vignettes/auto_metadata_fr.Rmd @@ -163,8 +163,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" "tab_to_treat" -## [8] "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" +## [7] "tab_to_treat" "df_tab_to_treat" ``` On retrouve le dataframe avec l'indicatrice du cluster `df_tab_to_treat`. On a le même résultat mais en format liste : chaque élément de la sous-liste `tab_to_treat` est un cluster indépendant. Les autres éléments de la liste sont les étapes de l'analyse. 
@@ -231,20 +231,21 @@ Ici, il n'y a pas de lien hiérarchique entre `SAL` (effectifs salariés des ent ``` r # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template ``` ``` -## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 -## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 +## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 hrc_field +## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA ``` Ensuite, on utilise ce dataframe en input de la fonction d'analyse. @@ -253,39 +254,22 @@ Ensuite, on utilise ce dataframe en input de la fonction d'analyse. 
``` r # Analyse complète, avec les étapes detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # visualisation du résultat de l'analyse cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts -## hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 +## cluster table_name field indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 +## 1 2021.SAL_DTH table_2021_SAL_DTH_1 2021 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 2 2021.SAL_DTH table_2021_SAL_DTH_2 2021 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 3 2022.SAL table_2022_SAL_1 2022 SAL HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 4 2022.SAL table_2022_SAL_2 2022 SAL HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 5 2022.SAL_DTH table_2022_SAL_DTH_1 2022 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 
hrc_legal_form_3 +## 6 2022.SAL_DTH table_2022_SAL_DTH_2 2022 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 ``` Finalement, il y a 6 tableaux à traiter dans 3 clusters différents. Autrement dit, il faudra faire trois fois appel à `tab_multi_manager()`. diff --git a/vignettes/auto_metadata_fr.Rmd.orig b/vignettes/auto_metadata_fr.Rmd.orig index ab17a82..b61d857 100644 --- a/vignettes/auto_metadata_fr.Rmd.orig +++ b/vignettes/auto_metadata_fr.Rmd.orig @@ -149,7 +149,8 @@ Ici, il n'y a pas de lien hiérarchique entre `SAL` (effectifs salariés des ent ```{r} # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template From fd9add36d7a4b345423b26531eba036573ff31c7 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 28 May 2026 16:54:52 +0200 Subject: [PATCH 10/38] =?UTF-8?q?docs:=20m=C3=AAme=20fix=20que=20pour=20au?= =?UTF-8?q?to=5Fmetadata=5Ffr,=20sans=20le=20pbm=20de=20header?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vignettes/auto_metadata.Rmd | 146 +++++++++---------------------- vignettes/auto_metadata.Rmd.orig | 9 +- 2 files changed, 47 insertions(+), 108 deletions(-) diff --git a/vignettes/auto_metadata.Rmd b/vignettes/auto_metadata.Rmd index 42e101f..229215e 100644 --- a/vignettes/auto_metadata.Rmd +++ b/vignettes/auto_metadata.Rmd @@ -141,16 +141,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size -## 4 
france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS -## hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## 1 hrc_naf hrc_lettuce -## 2 hrc_naf hrc_lettuce -## 3 hrc_nuts -## 4 hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` For the 12 tables to be published, it is sufficient to protect 4 tables. These tables are distributed across two different clusters. Therefore, `tab_multi_manager()` needs to be called twice. @@ -164,8 +164,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" -## [6] "grp_tab_in_clusters" "tab_to_treat" "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" +## [7] "tab_to_treat" "df_tab_to_treat" ``` One finds the dataframe with the cluster indicator `df_tab_to_treat`. The result is the same but in list format: each element of the list is an independent cluster `tab_to_treat`. Additionally, the 6 steps of the analysis are included. 
@@ -232,20 +232,21 @@ Here, there is no hierarchical link between `SAL` (employees of active companies ``` r # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template ``` ``` -## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 -## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 -## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 -## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 +## table_name field indicator hrc_indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 hrc_field +## 1 table_2021_SAL_DTH_1 2021 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 2 table_2021_SAL_DTH_2 2021 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 3 table_2022_SAL_1 2022 SAL NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 4 table_2022_SAL_2 2022 SAL NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA +## 5 table_2022_SAL_DTH_1 2022 SAL_DTH NA ACTIVITY LEGAL_FORM hrc_activity_131 hrc_legal_form_3 NA +## 6 table_2022_SAL_DTH_2 2022 SAL_DTH NA ACTIVITY NUMBER_EMPL hrc_activity_131 hrc_number_empl_4 NA ``` Next, this dataframe is used as input for the analysis function. @@ -254,39 +255,22 @@ Next, this dataframe is used as input for the analysis function. 
``` r # Analyse complète, avec les étapes detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) -``` -``` -## Error in `check_column_names()`: -## ! Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. -``` - -``` r # visualisation du résultat de l'analyse cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS -## hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## 1 hrc_naf hrc_lettuce -## 2 hrc_naf hrc_lettuce -## 3 hrc_nuts -## 4 hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 +## 1 2021.SAL_DTH table_2021_SAL_DTH_1 2021 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 2 2021.SAL_DTH table_2021_SAL_DTH_2 2021 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 3 2022.SAL table_2022_SAL_1 2022 SAL HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 +## 4 2022.SAL table_2022_SAL_2 2022 SAL HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## 5 2022.SAL_DTH table_2022_SAL_DTH_1 2022 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 
hrc_legal_form_3 +## 6 2022.SAL_DTH table_2022_SAL_DTH_2 2022 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 ``` Ultimately, there are 6 tables to process in 3 different clusters. In other words, `tab_multi_manager()` will need to be called three times. @@ -340,17 +324,6 @@ detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, ## the hrc_indicator column will be ignored. ``` -``` -## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0. -## ℹ Please use `reframe()` instead. -## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust -## accordingly. -## ℹ The deprecated feature was likely used in the rtauargus package. -## Please report the issue at . -## This warning is displayed once per session. -## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated. -``` - ``` r # Simplified output, only the data frame with the cluster indicator cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) @@ -368,62 +341,29 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS -## hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## 1 hrc_naf hrc_lettuce -## 2 hrc_naf hrc_lettuce -## 3 hrc_nuts -## 4 hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj 
HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 ``` # Going Further: Visualizing Inclusions -The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. +The create_edges step in the metadata analysis identifies tables included within other tables. For example, `T1` is included in `T2`. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. ``` r library(rtauargus) library(igraph) -``` - -``` -## -## Attaching package: 'igraph' -``` - -``` -## The following objects are masked from 'package:dplyr': -## -## as_data_frame, groups, union -``` - -``` -## The following objects are masked from 'package:stats': -## -## decompose, spectrum -``` - -``` -## The following object is masked from 'package:base': -## -## union -``` - -``` r library(visNetwork) -``` -``` -## Error in `library()`: -## ! there is no package called 'visNetwork' -``` - -``` r graph_links_tab <- function(list_desc_links){ list_desc_links %>% purrr::imap(function(ss_dem,i){ if(!is.null(ss_dem)){ @@ -446,10 +386,8 @@ graph_links_tab(detailed_analysis$create_edges) ``` ``` -## Error in `map2()`: -## ℹ In index: 1. -## ℹ With name: france_entreprises_2023.hrc_lettuce. -## Caused by error in `visOptions()`: -## ! 
could not find function "visOptions" +## $france_entreprises_2023.hrc_lettuce +## +## $france_entreprises_2023.to_pizza ``` diff --git a/vignettes/auto_metadata.Rmd.orig b/vignettes/auto_metadata.Rmd.orig index e94ebf1..afba68a 100644 --- a/vignettes/auto_metadata.Rmd.orig +++ b/vignettes/auto_metadata.Rmd.orig @@ -16,7 +16,7 @@ vignette: > knitr::opts_chunk$set(echo = TRUE) ``` -```{r message = FALSE} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(dplyr) ``` @@ -150,7 +150,8 @@ Here, there is no hierarchical link between `SAL` (employees of active companies ```{r} # cas où il n'y a aucune hiérarchie sur les indicateurs metadata_template <- template_formatted$metadata %>% - mutate(hrc_indicator = NA) %>% + mutate(hrc_field = NA, + hrc_indicator = NA) %>% select(table_name,field,indicator,hrc_indicator, everything()) metadata_template @@ -215,9 +216,9 @@ cluster_id_dataframe # Going Further: Visualizing Inclusions -The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. +The create_edges step in the metadata analysis identifies tables included within other tables. For example, `T1` is included in `T2`. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. 
-```{r} +```{r message = FALSE, warning = FALSE} library(rtauargus) library(igraph) library(visNetwork) From 5590c16b44ed814d5dbcbadf25366f3a2f4d2221 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Tue, 16 Jun 2026 10:30:12 +0200 Subject: [PATCH 11/38] feat: treat tables linked by equation but not broken down by the same spanning variables --- R/identify_hrc_with_eq.R | 89 +++++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 19 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 83dd55c..f5107f4 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -98,11 +98,11 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% dplyr::distinct() - # ---- 1) Identify ambiguous totals ---- + # Identify ambiguous totals total_counts <- parsed_equations %>% dplyr::count(total, name = "n_total") ambiguous_totals <- total_counts %>% dplyr::filter(n_total > 1) %>% pull(total) - # ---- 2) Build a total -> total_alt mapping by eq_name ---- + # Build a total -> total_alt mapping by eq_name # For all equations (ambiguous or not), create one row; # for non-ambiguous totals, total_alt == total alt_map <- parsed_equations %>% @@ -119,7 +119,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::ungroup() %>% dplyr::select(eq_name, total, total_alt) - # ---- 3) Apply the mapping to the links ---- + # Apply the mapping to the links # 'links' contains total, rhs, eq_name (if not, it must be joined beforehand) # here we assume links has an eq_name column; otherwise do # left_join(links, parsed_equations %>% select(eq_name, total, rhs), ...) 
first @@ -138,14 +138,19 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ select(total, rhs, eq_name) %>% dplyr::distinct() - # ---- 4) Build the full graph (including all copies) ---- + # Build the full graph (including all copies) g_full <- graph_from_data_frame(links_full %>% select(total, rhs), directed = TRUE) - # ---- 5) Compute components on g_full ---- + # Compute components on g_full comp_full <- igraph::components(g_full)$membership comp_df <- data.frame(var = names(comp_full), group = as.integer(comp_full), stringsAsFactors = FALSE) - # ---- 6) Update equations_long: + ############################################################################## + # browser() # use this combined with "./rtauargus/dev/graphes_equations_objet_browser.R" + # to get the graphs showing indicators links based on the equations + ############################################################################## + + # Update equations_long: # associate the alternative variable (if present) and the corresponding group ---- # Notes: # - equations_long contains the original variables (var) and eq_name; @@ -189,19 +194,65 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ mutate(across(dplyr::where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% left_join(equations_long_full, by = c("indicator" = "var")) - df_eq_initial_spannings <- df_spannings_eq %>% - filter(!is.na(eq_name)) %>% + # For each table, retrieve its spannings + spanning_par_table <- df_spannings_eq %>% + distinct(table_name, spanning, group) + + # Identify spannings shared within each group + spannings_communs <- spanning_par_table %>% + group_by(group, spanning) %>% + summarise( + tables_avec_ce_spanning = list(sort(unique(table_name))), + n_tables = n(), + .groups = "drop" + ) + + # Total number of tables per group + n_tables_par_group <- spanning_par_table %>% group_by(group) %>% - dplyr::reframe( - table_name = paste(unique(table_name), collapse = "."), - field = 
last(field), - hrc_field = last(hrc_field), - spanning = spanning, - hrc_spanning = hrc_spanning, - indicator = last(unit), - hrc_indicator = last(hrc_indicator) + summarise(n_total = n_distinct(table_name), .groups = "drop") + + spannings_communs <- spannings_communs %>% + left_join(n_tables_par_group, by = "group") + + # Spannings shared by all tables in the group + spanning_all <- spannings_communs %>% + filter(n_tables == n_total) # spanning present in every table + + # Tables with additional spannings (not shared by all tables) + spanning_extra <- spannings_communs %>% + filter(n_tables < n_total) %>% + tidyr::unnest(tables_avec_ce_spanning) %>% + rename(table_name = tables_avec_ce_spanning) + + # Build the merged group + # (all tables in the group + all spannings shared by every table) + df_groupe <- df_spannings_eq %>% + group_by(across(-c(table_name,side,var_mapped,indicator))) %>% + summarise( + table_name = paste(sort(unique(table_name)), collapse = "."), + indicator = last(unit), + .groups = "drop" ) %>% - dplyr::distinct(group, spanning, hrc_spanning, .keep_all = TRUE) + filter(spanning %in% spanning_all$spanning) + + # Build standalone rows for tables with extra spannings, keep the table on its + # own with all of its spannings + tables_extra <- spanning_extra %>% + distinct(table_name, group) + + df_solo <- df_spannings_eq %>% + semi_join(tables_extra, by = c("table_name", "group")) %>% + # Use the same indicator name as the other tables that share at least + # one cross-classification variable, so that all tables are processed + # within the same cluster. + # Rule for the next steps: the minimum condition for creating a cluster + # is that the tables share the same indicator. 
+ mutate(initial_indicator = indicator, + indicator = unit) + + df_eq_initial_spannings <- bind_rows(df_groupe, df_solo %>% select(-c(side,var_mapped))) %>% + arrange(group, table_name, spanning) # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. # Each equation name is transformed into its uppercase form with a "^h" suffix, @@ -219,9 +270,9 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ paste0(toupper(last(eq_name)), "^h") }, hrc_spanning = if(length(unique(eq_name)) > 1) { - paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_")) + paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_"),".totcode.",var_mapped[side == "total"][1]) } else { - paste0("hrc_", toupper(last(eq_name))) + paste0("hrc_", toupper(last(eq_name)),".totcode.",var_mapped[side == "total"][1]) }, indicator = last(unit), hrc_indicator = last(hrc_indicator), From 245936ce582256dfc09a1ab9ca693d019867ab49 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Tue, 16 Jun 2026 10:31:26 +0200 Subject: [PATCH 12/38] feat: change indicator into initial_indicator for final output to help user know how to build the tables --- R/analyse_metadata.R | 2 +- R/tab_to_treat.R | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/R/analyse_metadata.R b/R/analyse_metadata.R index ec155a2..d4dd6de 100644 --- a/R/analyse_metadata.R +++ b/R/analyse_metadata.R @@ -117,7 +117,7 @@ analyse_metadata <- function(df_metadata,df_eq_indicator = NULL,verbose = FALSE) list_independent_tables <- grp_tab_in_cluster(list_split = list_split, list_translation_tables = list_translation_tables) list_cluster_treat <- tab_to_treat(list_independent_tables) - dataframe_cluster_id <- dataframe_result(list_cluster_treat) + dataframe_cluster_id <- dataframe_result(list_cluster_treat,list_hrc_identified) if(verbose){ return(list( identify_hrc = list_hrc_identified[[1]], diff --git a/R/tab_to_treat.R b/R/tab_to_treat.R 
index b8f1d28..0d34bd6 100644 --- a/R/tab_to_treat.R +++ b/R/tab_to_treat.R @@ -130,24 +130,42 @@ tab_to_treat <- function(list_independent_tables) { #' } #' #' @importFrom purrr imap_dfr -dataframe_result <- function(list_independent_tables) { - # TODO modifier car il y a une erreur (column field doesn't exist) - # Combine the list of tibbles into a single dataframe with cluster identifiers +dataframe_result <- function(list_independent_tables, list_hrc_identified) { dataframe_metadata <- purrr::imap_dfr(list_independent_tables, function(tibble, tibble_name) { tibble %>% mutate(cluster = tibble_name) - }) %>% + }) + + # If the initial_indicator column exists in list_hrc_identified, + # replace indicator with initial_indicator whenever initial_indicator is not NA + if ("initial_indicator" %in% names(list_hrc_identified[[1]])) { + hrc_indicator_map <- list_hrc_identified %>% + purrr::map_dfr(identity) %>% + filter(!is.na(initial_indicator)) %>% + select(table_name, field, indicator, initial_indicator) %>% + distinct() + + dataframe_metadata <- dataframe_metadata %>% + left_join(hrc_indicator_map, by = c("table_name", "field", "indicator")) %>% + mutate(indicator = if_else(!is.na(initial_indicator), initial_indicator, indicator)) %>% + select(-initial_indicator) + } + + dataframe_metadata <- dataframe_metadata %>% select( cluster, table_name, field, indicator, - # Dynamically order columns spanning_xxx by their numeric suffix all_of(names(.)[grepl("^spanning_\\d+$", names(.))] %>% .[order(as.numeric(sub("spanning_", "", .)))]), - # Dynamically order columns hrc_spanning_xxx by their numeric suffix all_of(names(.)[grepl("^hrc_spanning_\\d+$", names(.))] %>% .[order(as.numeric(sub("hrc_spanning_", "", .)))]) - ) %>% as.data.frame() + ) %>% + as.data.frame() + + return(dataframe_metadata) } + + From 86e385fe50e31237cc11e3e7558e874614ecb9e6 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Tue, 16 Jun 2026 14:39:40 +0200 Subject: [PATCH 13/38] fix: match eq to metadata 
--- tests/testthat/test_analyse_metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_analyse_metadata.R b/tests/testthat/test_analyse_metadata.R index a075c40..75340a8 100644 --- a/tests/testthat/test_analyse_metadata.R +++ b/tests/testthat/test_analyse_metadata.R @@ -188,7 +188,7 @@ answer <- data.frame( test_that("indicators equation", { df_eq_ex <- data.frame( eq_name = c("eq1"), - eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + eq_indicator = c("to_lettuce = to_batavia + to_arugula"), unit = c("EUR"), stringsAsFactors = FALSE ) From 04d6d066c75b876c018242720d86ec361733d7e7 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 16 Jun 2026 14:50:10 +0200 Subject: [PATCH 14/38] fix: don't look at hrc_indic when indic part of df_eq_indicator --- R/identify_hrc_with_eq.R | 171 ++++++++++++++++++++++----------------- 1 file changed, 99 insertions(+), 72 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index f5107f4..a088255 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -72,17 +72,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::ungroup() %>% dplyr::select(eq_name, unit, total, everything()) - # change to long format in order to join with df_metadata_long - equations_long <- parsed_equations %>% - mutate(across(c(total, starts_with("rhs")), trimws)) %>% - tidyr::pivot_longer( - cols = c(total, starts_with("rhs")), - names_to = "side", # côté équation (total / rhs1 / rhs2...) 
- values_to = "var" - ) %>% - filter(!is.na(var)) - - # Identify chained equations (A = B + C, B = D + E → group both equations together) + # Identify chained equations (A = B + C, B = D + E) and group equations together # Build dependency links between totals and rhs links <- parsed_equations %>% @@ -110,11 +100,11 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::group_by(total) %>% dplyr::arrange(eq_name) %>% # ordre stable dplyr::mutate(alt_idx = dplyr::row_number(), - total_alt = dplyr::case_when( - dplyr::n() == 1 ~ total, - alt_idx == 1 ~ total, - TRUE ~ paste0(total, "_alt", alt_idx - 1) - ) + total_alt = dplyr::case_when( + dplyr::n() == 1 ~ total, + alt_idx == 1 ~ total, + TRUE ~ paste0(total, "_alt", alt_idx - 1) + ) ) %>% dplyr::ungroup() %>% dplyr::select(eq_name, total, total_alt) @@ -150,8 +140,18 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # to get the graphs showing indicators links based on the equations ############################################################################## + # reformat parsed_equations in long format in order to join with df_metadata_long + equations_long <- parsed_equations %>% + mutate(across(c(total, starts_with("rhs")), trimws)) %>% + tidyr::pivot_longer( + cols = c(total, starts_with("rhs")), + names_to = "side", # côté équation (total / rhs1 / rhs2...) + values_to = "var" + ) %>% + filter(!is.na(var)) + # Update equations_long: - # associate the alternative variable (if present) and the corresponding group ---- + # associate the alternative variable (if present) and the corresponding group # Notes: # - equations_long contains the original variables (var) and eq_name; # - we want to recover the "var" or "var_alt" version used in g_full. 
@@ -168,15 +168,18 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # 'df_spannings' is a modified version of 'df_metadata_long' where: # - 'spanning' is replaced by its uppercase hierarchical version if available, # - 'indicator' is replaced by its uppercase hierarchical version - # (without the 'hrc_' prefix) if available. + # (without the 'hrc_' prefix) if available and 'indicator' not part of 'df_eq_indicator' + browser() + indic_not_in_eq <- setdiff(unique(df_metadata_long$indicator),unique(equations_long$var)) + df_spannings <- df_metadata_long %>% mutate(spanning_old = spanning) %>% mutate(spanning = ifelse(is.na(hrc_spanning), spanning, toupper(hrc_spanning))) %>% - mutate(indicator = ifelse(is.na(hrc_indicator), - indicator, - toupper(sub("hrc_","",hrc_indicator)))) + mutate(indicator = ifelse(indicator %in% indic_not_in_eq & !is.na(hrc_indicator), + toupper(sub("hrc_","",hrc_indicator)), + indicator)) # 'df_variable_info' is a reference table linking original spanning names ('spanning_old') # to their transformed counterparts ('spanning'), along with the corresponding table name. 
@@ -194,65 +197,89 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ mutate(across(dplyr::where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% left_join(equations_long_full, by = c("indicator" = "var")) - # For each table, retrieve its spannings - spanning_par_table <- df_spannings_eq %>% - distinct(table_name, spanning, group) - # Identify spannings shared within each group - spannings_communs <- spanning_par_table %>% - group_by(group, spanning) %>% - summarise( - tables_avec_ce_spanning = list(sort(unique(table_name))), - n_tables = n(), - .groups = "drop" - ) + ############################################################################## + # Séparer les lignes avec et sans group + df_with_group <- df_spannings_eq %>% filter(!is.na(group)) + df_without_group <- df_spannings_eq %>% filter(is.na(group)) - # Total number of tables per group - n_tables_par_group <- spanning_par_table %>% + # ---- Traitement des lignes SANS group (ancien code) ---- + df_eq_initial_spannings_no_group <- df_without_group %>% + filter(!is.na(eq_name)) %>% group_by(group) %>% - summarise(n_total = n_distinct(table_name), .groups = "drop") + dplyr::reframe( + table_name = paste(unique(table_name), collapse = "."), + field = last(field), + hrc_field = last(hrc_field), + spanning = spanning, + hrc_spanning = hrc_spanning, + indicator = last(unit), + hrc_indicator = last(hrc_indicator) + ) %>% unique() - spannings_communs <- spannings_communs %>% - left_join(n_tables_par_group, by = "group") + # ---- Traitement des lignes AVEC group (nouveau code) ---- + if(nrow(df_with_group) > 0){ + spanning_by_table <- df_with_group %>% + distinct(table_name, spanning, group) - # Spannings shared by all tables in the group - spanning_all <- spannings_communs %>% - filter(n_tables == n_total) # spanning present in every table + common_spannings <- spanning_by_table %>% + group_by(group, spanning) %>% + summarise( + tables_avec_ce_spanning = list(sort(unique(table_name))), + 
n_tables = n(), + .groups = "drop" + ) - # Tables with additional spannings (not shared by all tables) - spanning_extra <- spannings_communs %>% - filter(n_tables < n_total) %>% - tidyr::unnest(tables_avec_ce_spanning) %>% - rename(table_name = tables_avec_ce_spanning) + n_tables_par_group <- df_with_group %>% + group_by(group) %>% + summarise(n_total = n_distinct(table_name), .groups = "drop") - # Build the merged group - # (all tables in the group + all spannings shared by every table) - df_groupe <- df_spannings_eq %>% - group_by(across(-c(table_name,side,var_mapped,indicator))) %>% - summarise( - table_name = paste(sort(unique(table_name)), collapse = "."), - indicator = last(unit), - .groups = "drop" + common_spannings <- common_spannings %>% + left_join(n_tables_par_group, by = "group") + + spanning_all <- common_spannings %>% + filter(n_tables == n_total) + + spanning_extra <- common_spannings %>% + filter(n_tables < n_total) %>% + tidyr::unnest(tables_avec_ce_spanning) %>% + rename(table_name = tables_avec_ce_spanning) + + df_groupe <- df_with_group %>% + group_by(across(-c(table_name, side, var_mapped, indicator))) %>% + summarise( + table_name = paste(sort(unique(table_name)), collapse = "."), + indicator = last(unit), + .groups = "drop" + ) %>% + filter(spanning %in% spanning_all$spanning) + + tables_extra <- spanning_extra %>% + distinct(table_name, group) + + df_solo <- df_with_group %>% + semi_join(tables_extra, by = c("table_name", "group")) %>% + mutate( + initial_indicator = var_mapped[side == "total"][1], + indicator = unit + ) + + df_eq_initial_spannings_with_group <- bind_rows( + df_groupe, + df_solo %>% select(-c(side, var_mapped)) ) %>% - filter(spanning %in% spanning_all$spanning) - - # Build standalone rows for tables with extra spannings, keep the table on its - # own with all of its spannings - tables_extra <- spanning_extra %>% - distinct(table_name, group) - - df_solo <- df_spannings_eq %>% - semi_join(tables_extra, by = c("table_name", 
"group")) %>% - # Use the same indicator name as the other tables that share at least - # one cross-classification variable, so that all tables are processed - # within the same cluster. - # Rule for the next steps: the minimum condition for creating a cluster - # is that the tables share the same indicator. - mutate(initial_indicator = indicator, - indicator = unit) - - df_eq_initial_spannings <- bind_rows(df_groupe, df_solo %>% select(-c(side,var_mapped))) %>% - arrange(group, table_name, spanning) + arrange(group, table_name, spanning) + + # ---- Combinaison finale ---- + df_eq_initial_spannings <- bind_rows( + df_eq_initial_spannings_no_group, + df_eq_initial_spannings_with_group + ) + }else{ + df_eq_initial_spannings <- df_eq_initial_spannings_no_group + } + + ############################################################################## # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. # Each equation name is transformed into its uppercase form with a "^h" suffix, From ad5e7fc0961f8a8f215de55227072efbd7f31a33 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Wed, 17 Jun 2026 11:37:28 +0200 Subject: [PATCH 15/38] =?UTF-8?q?feat:=20suppression=20hrc=5Findicator=20p?= =?UTF-8?q?our=20les=20indicator=20pr=C3=A9sents=20dans=20les=20=C3=A9quat?= =?UTF-8?q?ions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index a088255..c65c968 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -169,17 +169,19 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # - 'spanning' is replaced by its uppercase hierarchical version if available, # - 'indicator' is replaced by its uppercase hierarchical version # (without the 'hrc_' prefix) if available and 'indicator' not part of 
'df_eq_indicator' - browser() indic_not_in_eq <- setdiff(unique(df_metadata_long$indicator),unique(equations_long$var)) df_spannings <- df_metadata_long %>% - mutate(spanning_old = spanning) %>% - mutate(spanning = ifelse(is.na(hrc_spanning), + mutate(spanning_old = spanning, + spanning = ifelse(is.na(hrc_spanning), spanning, - toupper(hrc_spanning))) %>% - mutate(indicator = ifelse(indicator %in% indic_not_in_eq & !is.na(hrc_indicator), - toupper(sub("hrc_","",hrc_indicator)), - indicator)) + toupper(hrc_spanning)), + indicator = ifelse(indicator %in% indic_not_in_eq & !is.na(hrc_indicator), + toupper(sub("hrc_","",hrc_indicator)), + indicator), + hrc_indicator = ifelse(indicator %in% unique(equations_long$var), + NA, + hrc_indicator)) # 'df_variable_info' is a reference table linking original spanning names ('spanning_old') # to their transformed counterparts ('spanning'), along with the corresponding table name. @@ -197,26 +199,11 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ mutate(across(dplyr::where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% left_join(equations_long_full, by = c("indicator" = "var")) - ############################################################################## # Séparer les lignes avec et sans group df_with_group <- df_spannings_eq %>% filter(!is.na(group)) df_without_group <- df_spannings_eq %>% filter(is.na(group)) - # ---- Traitement des lignes SANS group (ancien code) ---- - df_eq_initial_spannings_no_group <- df_without_group %>% - filter(!is.na(eq_name)) %>% - group_by(group) %>% - dplyr::reframe( - table_name = paste(unique(table_name), collapse = "."), - field = last(field), - hrc_field = last(hrc_field), - spanning = spanning, - hrc_spanning = hrc_spanning, - indicator = last(unit), - hrc_indicator = last(hrc_indicator) - ) %>% unique() - # ---- Traitement des lignes AVEC group (nouveau code) ---- if(nrow(df_with_group) > 0){ spanning_by_table <- df_with_group %>% From 
d831ab0ba77f933729cd5241fd1ec58e340e283a Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Wed, 17 Jun 2026 17:48:19 +0200 Subject: [PATCH 16/38] =?UTF-8?q?feat:=20gestion=20des=20diff=C3=A9rents?= =?UTF-8?q?=20cas=20de=20variables=20de=20croisements=20communes=20ou=20no?= =?UTF-8?q?n=20aux=20inidcatuers=20des=20=C3=A9quations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 204 +++++++++++++++++++++++++++------------ 1 file changed, 142 insertions(+), 62 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index c65c968..0a9205f 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -204,67 +204,107 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ df_with_group <- df_spannings_eq %>% filter(!is.na(group)) df_without_group <- df_spannings_eq %>% filter(is.na(group)) - # ---- Traitement des lignes AVEC group (nouveau code) ---- - if(nrow(df_with_group) > 0){ - spanning_by_table <- df_with_group %>% - distinct(table_name, spanning, group) - - common_spannings <- spanning_by_table %>% - group_by(group, spanning) %>% - summarise( - tables_avec_ce_spanning = list(sort(unique(table_name))), - n_tables = n(), - .groups = "drop" - ) - - n_tables_par_group <- df_with_group %>% - group_by(group) %>% - summarise(n_total = n_distinct(table_name), .groups = "drop") - - common_spannings <- common_spannings %>% - left_join(n_tables_par_group, by = "group") - - spanning_all <- common_spannings %>% - filter(n_tables == n_total) - - spanning_extra <- common_spannings %>% - filter(n_tables < n_total) %>% - tidyr::unnest(tables_avec_ce_spanning) %>% - rename(table_name = tables_avec_ce_spanning) - - df_groupe <- df_with_group %>% - group_by(across(-c(table_name, side, var_mapped, indicator))) %>% - summarise( - table_name = paste(sort(unique(table_name)), collapse = "."), - indicator = last(unit), - .groups = "drop" - ) %>% - filter(spanning %in% 
spanning_all$spanning) - - tables_extra <- spanning_extra %>% - distinct(table_name, group) - - df_solo <- df_with_group %>% - semi_join(tables_extra, by = c("table_name", "group")) %>% - mutate( - initial_indicator = var_mapped[side == "total"][1], - indicator = unit - ) - - df_eq_initial_spannings_with_group <- bind_rows( - df_groupe, - df_solo %>% select(-c(side, var_mapped)) - ) %>% - arrange(group, table_name, spanning) - - # ---- Combinaison finale ---- - df_eq_initial_spannings <- bind_rows( - df_eq_initial_spannings_no_group, - df_eq_initial_spannings_with_group - ) - }else{ - df_eq_initial_spannings <- df_eq_initial_spannings_no_group - } + spanning_combination_group <- df_with_group |> + group_by(group, table_name) |> + summarise( + spanning = list(sort(unique(spanning))), + side = first(side), + .groups = "drop" + ) |> + group_by(group) |> + mutate( + all_sides = list(sort(unique(side))), + spanning_key = purrr::map_chr(spanning, paste, collapse = "|") + ) |> + ungroup() + + # Pour chaque combinaison unique, les sides couverts sont ceux des tables + # dont le spanning_set est un sur-ensemble de la combinaison + spanning_combination_group <- spanning_combination_group |> + distinct(group, spanning_key, spanning, all_sides) |> + group_by(group, spanning_key) |> + summarise( + spanning = list(spanning[[1]]), + all_sides = list(all_sides[[1]]), + .groups = "drop" + ) |> + mutate( + # Pour chaque combinaison, chercher tous les sides des tables + # dont le spanning_set contient cette combinaison + sides_couverts = purrr::map2(spanning, group, function(span_set, grp) { + spanning_combination_group |> + filter(group == grp) |> + filter(purrr::map_lgl(spanning, ~ all(span_set %in% .x))) |> + pull(side) |> + sort() |> + unique() + }), + sides_manquants = purrr::map2(all_sides, sides_couverts, setdiff), + combinaison_complete = purrr::map_lgl(sides_manquants, ~ length(.x) == 0) + ) |> + unnest_wider(spanning, names_sep = "_") + + list_groups <- split(df_with_group, 
df_with_group$group) + + df_eq_initial_spannings <- purrr::imap(list_groups, function(df_group, group_nb) { + x <- spanning_combination_group |> dplyr::filter(group == group_nb) |> + dplyr::pull(combinaison_complete) + + if (all(x)) { + regroup_tables(df_group, spanning_combination_group) + } else if (all(!x)) { + df_group |> + dplyr::mutate( + initial_indicator = var_mapped[side == "total"][1], + indicator = unit + ) |> + dplyr::select(table_name, field, hrc_field, indicator, hrc_indicator, + spanning, hrc_spanning, eq_name, unit, group, initial_indicator) + } else if (!all(x)) { + # Combinaisons incomplètes : certaines combis couvrent tous les sides, d'autres non + # On récupère les spanning_keys complètes depuis spanning_combination_group + span_comb <- spanning_combination_group |> filter(group == as.integer(group_nb)) + + complete_span_keys <- span_comb |> filter(combinaison_complete) |> pull(spanning_key) + incomplete_span_keys <- span_comb |> filter(!combinaison_complete) |> pull(spanning_key) + + # Tables dont le spanning_key est complet -> regrouper + spanning_by_table <- df_group |> + group_by(table_name) |> + summarise(spanning_key = paste(sort(unique(spanning)), collapse = "|"), .groups = "drop") + + tables_complete <- spanning_by_table |> filter(spanning_key %in% complete_span_keys) |> pull(table_name) + tables_incomplete <- spanning_by_table |> filter(spanning_key %in% incomplete_span_keys) |> pull(table_name) + + # Tables avec combinaison complète -> fusionner + df_merged <- if (length(tables_complete) > 0) { + df_group |> + filter(table_name %in% tables_complete) |> + group_by(across(-c(table_name, side, var_mapped, indicator))) |> + summarise( + table_name = paste(sort(unique(table_name)), collapse = "."), + indicator = last(unit), + initial_indicator = var_mapped[side == "total"][1], + .groups = "drop" + ) + } + + # Tables avec combinaison incomplète -> garder seules + df_solo <- if (length(tables_incomplete) > 0) { + df_group |> + 
filter(table_name %in% tables_incomplete) |> + mutate( + initial_indicator = var_mapped[side == "total"][1], + indicator = unit + ) |> + select(-c(side, var_mapped)) + } + + bind_rows(df_merged, df_solo) |> arrange(group, table_name, spanning) + } + }) |> + purrr::compact() |> + dplyr::bind_rows() ############################################################################## @@ -275,7 +315,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ filter(!is.na(eq_name)) %>% group_by(group) %>% summarise( - table_name = paste(unique(table_name), collapse = "."), + table_name = paste(sort(unique(table_name)), collapse = "."), field = last(field), hrc_field = last(hrc_field), spanning = if(length(unique(eq_name)) > 1) { @@ -332,3 +372,43 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ return(list_hrc_identified) } } + +regroup_tables <- function(df_group, spanning_combination_group) { + + current_group <- unique(df_group$group) + + # Récupérer les spanning_keys du groupe courant + span_comb <- spanning_combination_group |> filter(group == current_group) + + # Spanning_key par table + spanning_by_table <- df_group |> + group_by(table_name) |> + summarise( + spanning_key = paste(sort(unique(spanning)), collapse = "|"), + .groups = "drop" + ) + + # Grouper les tables par spanning_key identique + table_clusters <- spanning_by_table |> + group_by(spanning_key) |> + summarise(tables = list(sort(unique(table_name))), .groups = "drop") + + # Pour chaque cluster, fusionner les tables + purrr::map_dfr(seq_len(nrow(table_clusters)), function(i) { + cluster_tables <- table_clusters$tables[[i]] + cluster_span_key <- table_clusters$spanning_key[[i]] + + df_group |> + filter(table_name %in% cluster_tables) |> + group_by(across(-c(table_name, side, var_mapped, indicator))) |> + summarise( + table_name = paste(sort(unique(table_name)), collapse = "."), + indicator = last(unit), + initial_indicator = var_mapped[side == "total"][1], + .groups = 
"drop" + ) |> + unique() |> + select(table_name,field,hrc_field,indicator,everything()) + }) +} + From f424d876ea26e25feb4bc715574283638b705028 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Wed, 17 Jun 2026 17:49:49 +0200 Subject: [PATCH 17/38] fix: some examples had the same table twice, TO DO come back later to understand why there was a duplicate in the first place --- R/tab_to_treat.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/tab_to_treat.R b/R/tab_to_treat.R index 0d34bd6..92ea087 100644 --- a/R/tab_to_treat.R +++ b/R/tab_to_treat.R @@ -161,7 +161,8 @@ dataframe_result <- function(list_independent_tables, list_hrc_identified) { all_of(names(.)[grepl("^hrc_spanning_\\d+$", names(.))] %>% .[order(as.numeric(sub("hrc_spanning_", "", .)))]) ) %>% - as.data.frame() + as.data.frame() %>% + unique() # TODO come back to this, why where there duplicates in the first place return(dataframe_metadata) } From 21307fb3cb612c2684d53e07ec44147e2e79049c Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Wed, 17 Jun 2026 18:05:14 +0200 Subject: [PATCH 18/38] feat: treat cases of spanning variables for equaitons in one function --- R/identify_hrc_with_eq.R | 108 +++++++++++---------------------------- 1 file changed, 31 insertions(+), 77 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 0a9205f..c86a93a 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -246,62 +246,8 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ list_groups <- split(df_with_group, df_with_group$group) - df_eq_initial_spannings <- purrr::imap(list_groups, function(df_group, group_nb) { - x <- spanning_combination_group |> dplyr::filter(group == group_nb) |> - dplyr::pull(combinaison_complete) - - if (all(x)) { - regroup_tables(df_group, spanning_combination_group) - } else if (all(!x)) { - df_group |> - dplyr::mutate( - initial_indicator = var_mapped[side == "total"][1], - indicator = unit - ) |> - 
dplyr::select(table_name, field, hrc_field, indicator, hrc_indicator, - spanning, hrc_spanning, eq_name, unit, group, initial_indicator) - } else if (!all(x)) { - # Combinaisons incomplètes : certaines combis couvrent tous les sides, d'autres non - # On récupère les spanning_keys complètes depuis spanning_combination_group - span_comb <- spanning_combination_group |> filter(group == as.integer(group_nb)) - - complete_span_keys <- span_comb |> filter(combinaison_complete) |> pull(spanning_key) - incomplete_span_keys <- span_comb |> filter(!combinaison_complete) |> pull(spanning_key) - - # Tables dont le spanning_key est complet -> regrouper - spanning_by_table <- df_group |> - group_by(table_name) |> - summarise(spanning_key = paste(sort(unique(spanning)), collapse = "|"), .groups = "drop") - - tables_complete <- spanning_by_table |> filter(spanning_key %in% complete_span_keys) |> pull(table_name) - tables_incomplete <- spanning_by_table |> filter(spanning_key %in% incomplete_span_keys) |> pull(table_name) - - # Tables avec combinaison complète -> fusionner - df_merged <- if (length(tables_complete) > 0) { - df_group |> - filter(table_name %in% tables_complete) |> - group_by(across(-c(table_name, side, var_mapped, indicator))) |> - summarise( - table_name = paste(sort(unique(table_name)), collapse = "."), - indicator = last(unit), - initial_indicator = var_mapped[side == "total"][1], - .groups = "drop" - ) - } - - # Tables avec combinaison incomplète -> garder seules - df_solo <- if (length(tables_incomplete) > 0) { - df_group |> - filter(table_name %in% tables_incomplete) |> - mutate( - initial_indicator = var_mapped[side == "total"][1], - indicator = unit - ) |> - select(-c(side, var_mapped)) - } - - bind_rows(df_merged, df_solo) |> arrange(group, table_name, spanning) - } + df_eq_initial_spannings <- purrr::map(list_groups, function(df_group) { + regroup_tables(df_group, spanning_combination_group) }) |> purrr::compact() |> dplyr::bind_rows() @@ -374,41 +320,49 
@@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ } regroup_tables <- function(df_group, spanning_combination_group) { - current_group <- unique(df_group$group) - # Récupérer les spanning_keys du groupe courant - span_comb <- spanning_combination_group |> filter(group == current_group) - - # Spanning_key par table + # spanning_key par table spanning_by_table <- df_group |> group_by(table_name) |> - summarise( - spanning_key = paste(sort(unique(spanning)), collapse = "|"), - .groups = "drop" - ) + summarise(spanning_key = paste(sort(unique(spanning)), collapse = "|"), .groups = "drop") + + # Récupérer le statut complet/incomplet par spanning_key + span_comb <- spanning_combination_group |> + filter(group == current_group) |> + select(spanning_key, combinaison_complete) - # Grouper les tables par spanning_key identique - table_clusters <- spanning_by_table |> - group_by(spanning_key) |> - summarise(tables = list(sort(unique(table_name))), .groups = "drop") + spanning_by_table <- spanning_by_table |> left_join(span_comb, by = "spanning_key") - # Pour chaque cluster, fusionner les tables - purrr::map_dfr(seq_len(nrow(table_clusters)), function(i) { - cluster_tables <- table_clusters$tables[[i]] - cluster_span_key <- table_clusters$spanning_key[[i]] + tables_complete <- spanning_by_table |> filter(combinaison_complete) |> pull(table_name) + tables_incomplete <- spanning_by_table |> filter(!combinaison_complete) |> pull(table_name) + # Tables complètes -> fusionner par spanning_key identique + df_merged <- if (length(tables_complete) > 0) { df_group |> - filter(table_name %in% cluster_tables) |> + filter(table_name %in% tables_complete) |> group_by(across(-c(table_name, side, var_mapped, indicator))) |> summarise( table_name = paste(sort(unique(table_name)), collapse = "."), indicator = last(unit), initial_indicator = var_mapped[side == "total"][1], .groups = "drop" + ) + } + + # Tables incomplètes -> garder seules + df_solo <- if 
(length(tables_incomplete) > 0) { + df_group |> + filter(table_name %in% tables_incomplete) |> + mutate( + initial_indicator = var_mapped[side == "total"][1], + indicator = unit ) |> - unique() |> - select(table_name,field,hrc_field,indicator,everything()) - }) + select(-c(side, var_mapped)) + } + + bind_rows(df_merged, df_solo) |> + arrange(table_name, spanning) |> + select(table_name, field, hrc_field, indicator, everything()) } From ab96a3e0c6bceea0725f292ece1d0c79423bc573 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 23 Jun 2026 12:11:48 +0200 Subject: [PATCH 19/38] fix: use function from dplyr --- R/tab_to_treat.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/tab_to_treat.R b/R/tab_to_treat.R index 92ea087..bc41fdd 100644 --- a/R/tab_to_treat.R +++ b/R/tab_to_treat.R @@ -134,7 +134,7 @@ dataframe_result <- function(list_independent_tables, list_hrc_identified) { dataframe_metadata <- purrr::imap_dfr(list_independent_tables, function(tibble, tibble_name) { tibble %>% mutate(cluster = tibble_name) }) - + # browser() # If the initial_indicator column exists in list_hrc_identified, # replace indicator with initial_indicator whenever initial_indicator is not NA if ("initial_indicator" %in% names(list_hrc_identified[[1]])) { @@ -146,7 +146,7 @@ dataframe_result <- function(list_independent_tables, list_hrc_identified) { dataframe_metadata <- dataframe_metadata %>% left_join(hrc_indicator_map, by = c("table_name", "field", "indicator")) %>% - mutate(indicator = if_else(!is.na(initial_indicator), initial_indicator, indicator)) %>% + mutate(indicator = dplyr::if_else(!is.na(initial_indicator), initial_indicator, indicator)) %>% select(-initial_indicator) } From 5bd98b99ddb690a514ab1eeebafbb098edadae39 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 23 Jun 2026 12:12:33 +0200 Subject: [PATCH 20/38] fix: name hrc_spanning with totcode being total of equation --- R/identify_hrc_with_eq.R | 75 
++++++++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index c86a93a..47e323f 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -240,7 +240,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ unique() }), sides_manquants = purrr::map2(all_sides, sides_couverts, setdiff), - combinaison_complete = purrr::map_lgl(sides_manquants, ~ length(.x) == 0) + all_combinations = purrr::map_lgl(sides_manquants, ~ length(.x) == 0) ) |> unnest_wider(spanning, names_sep = "_") @@ -253,15 +253,27 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::bind_rows() ############################################################################## + table_group_mapping <- df_eq_initial_spannings %>% + # On éclate le table_name combiné pour retrouver les tables individuelles + mutate(table_name_combined = table_name) %>% + tidyr::separate_rows(table_name, sep = "\\.") %>% + select(table_name, table_name_combined, group) + + totcode_equation <- df_spannings_eq %>% + filter(side == "total") %>% + group_by(group) %>% + summarise(totcode = first(var_mapped), .groups = "drop") # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. # Each equation name is transformed into its uppercase form with a "^h" suffix, # and its hierarchical version prefixed with "hrc_". 
df_eq_indicator_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% - group_by(group) %>% + left_join(table_group_mapping, by = c("table_name", "group")) %>% + left_join(totcode_equation, by = "group") %>% + group_by(group, table_name_combined) %>% summarise( - table_name = paste(sort(unique(table_name)), collapse = "."), + table_name = first(table_name_combined), field = last(field), hrc_field = last(hrc_field), spanning = if(length(unique(eq_name)) > 1) { @@ -270,15 +282,15 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ paste0(toupper(last(eq_name)), "^h") }, hrc_spanning = if(length(unique(eq_name)) > 1) { - paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_"),".totcode.",var_mapped[side == "total"][1]) + paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_"),".totcode.",first(totcode)) } else { - paste0("hrc_", toupper(last(eq_name)),".totcode.",var_mapped[side == "total"][1]) + paste0("hrc_", toupper(last(eq_name)),".totcode.",first(totcode)) }, indicator = last(unit), hrc_indicator = last(hrc_indicator), .groups = "drop" ) %>% - dplyr::distinct(group, spanning, hrc_spanning, .keep_all = TRUE) + dplyr::distinct(group, table_name, spanning, hrc_spanning, .keep_all = TRUE) # 'df_indicators' combines both initial and indicator spanning information # into a single harmonized dataset, keeping key structural columns @@ -319,6 +331,47 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ } } +#' Regroup tables within a group (i.e. equation / group of linked equations) +#' based on spanning combination completeness +#' +#' @description +#' For a given group of tables, this function identifies which tables cover all +#' sides of an equation (total, rhs1, rhs2, ...) for their spanning combination, +#' and which do not. Tables with complete combinations are merged into a single +#' row; tables with incomplete combinations are kept as standalone rows with +#' their original spannings. 
+#' +#' @param df_group A tibble containing the rows of a single group from +#' \code{df_with_group}. Must contain columns: \code{table_name}, +#' \code{spanning}, \code{side}, \code{var_mapped}, \code{indicator}, +#' \code{unit}, and \code{group}. +#' @param spanning_combination_group A tibble produced by the +#' \code{spanning_combination_group} pipeline, containing one row per +#' (group, spanning_key) combination. Must contain columns: \code{group}, +#' \code{spanning_key}, and \code{all_combinations} (logical indicating +#' whether the spanning combination covers all sides of the equation). +#' +#' @return A tibble with one row per (merged or solo) table cluster and +#' spanning, containing the following columns (among others): +#' \describe{ +#' \item{table_name}{Dot-separated list of merged table names (e.g. +#' \code{"T7.T9.T11"}) for complete combinations, or the original +#' table name for incomplete ones.} +#' \item{indicator}{The unit value shared across the merged tables.} +#' \item{initial_indicator}{The \code{var_mapped} value of the \code{total} +#' side, used to track the original indicator before merging.} +#' } +#' +#' @examples +#' \dontrun{ +#' list_groups <- split(df_with_group, df_with_group$group) +#' +#' df_eq_initial_spannings <- purrr::map(list_groups, function(df_group) { +#' regroup_tables(df_group, spanning_combination_group) +#' }) |> +#' purrr::compact() |> +#' dplyr::bind_rows() +#' } regroup_tables <- function(df_group, spanning_combination_group) { current_group <- unique(df_group$group) @@ -330,24 +383,26 @@ regroup_tables <- function(df_group, spanning_combination_group) { # Récupérer le statut complet/incomplet par spanning_key span_comb <- spanning_combination_group |> filter(group == current_group) |> - select(spanning_key, combinaison_complete) + select(spanning_key, all_combinations) spanning_by_table <- spanning_by_table |> left_join(span_comb, by = "spanning_key") - tables_complete <- spanning_by_table |> 
filter(combinaison_complete) |> pull(table_name) - tables_incomplete <- spanning_by_table |> filter(!combinaison_complete) |> pull(table_name) + tables_complete <- spanning_by_table |> filter(all_combinations) |> pull(table_name) + tables_incomplete <- spanning_by_table |> filter(!all_combinations) |> pull(table_name) # Tables complètes -> fusionner par spanning_key identique df_merged <- if (length(tables_complete) > 0) { df_group |> filter(table_name %in% tables_complete) |> + left_join(spanning_by_table |> select(table_name, spanning_key), by = "table_name") |> group_by(across(-c(table_name, side, var_mapped, indicator))) |> summarise( table_name = paste(sort(unique(table_name)), collapse = "."), indicator = last(unit), initial_indicator = var_mapped[side == "total"][1], .groups = "drop" - ) + ) |> + select(-spanning_key) } # Tables incomplètes -> garder seules From b366b5e5e98475cc2754d4b3a848632fefe6e0ab Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 23 Jun 2026 14:55:13 +0200 Subject: [PATCH 21/38] refactor: one mutate() instead of two --- R/identify_hrc_with_eq.R | 105 +++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 49 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 47e323f..5dcefa0 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -62,10 +62,8 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # with each rhs term placed in a separate column. 
parsed_equations <- df_eq_indicator %>% tidyr::separate(eq_indicator, into = c("total", "rhs"), sep = "=", extra = "merge") %>% - dplyr::mutate(rhs = trimws(rhs)) %>% tidyr::separate_rows(rhs, sep = "\\+") %>% - dplyr::mutate(rhs = trimws(rhs), - total = trimws(total)) %>% + dplyr::mutate(across(c(total, rhs), trimws)) %>% dplyr::group_by(dplyr::across(-rhs)) %>% dplyr::mutate(term_number = paste0("rhs", dplyr::row_number())) %>% tidyr::pivot_wider(names_from = term_number, values_from = rhs) %>% @@ -200,10 +198,36 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ left_join(equations_long_full, by = c("indicator" = "var")) ############################################################################## - # Séparer les lignes avec et sans group df_with_group <- df_spannings_eq %>% filter(!is.na(group)) df_without_group <- df_spannings_eq %>% filter(is.na(group)) + # --- CAS 1 : aucune ligne avec group → pas d'équation à traiter --- + if (nrow(df_with_group) == 0) { + if (nrow(df_without_group) > 0) { + if (all(is.na(df_without_group$hrc_indicator))) { + return(list(df_without_group, df_variable_info)) + } else { + df_no_eq_indicators <- df_without_group %>% + filter(!is.na(hrc_indicator)) %>% + dplyr::group_by(table_name) %>% + summarise( + field = last(field), + hrc_field = last(hrc_field), + spanning = paste0(toupper(last(hrc_indicator)), "^h"), + hrc_spanning = last(hrc_indicator), + indicator = last(indicator), + hrc_indicator = last(hrc_indicator) + ) %>% + bind_rows(df_spannings, .) 
%>% + arrange(table_name) + return(list(df_no_eq_indicators, df_variable_info)) + } + } else { + return(list(df_spannings_eq, df_variable_info)) + } + } + + # --- CAS 2 : il y a des lignes avec group → traitement des équations --- spanning_combination_group <- df_with_group |> group_by(group, table_name) |> summarise( @@ -218,8 +242,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) |> ungroup() - # Pour chaque combinaison unique, les sides couverts sont ceux des tables - # dont le spanning_set est un sur-ensemble de la combinaison spanning_combination_group <- spanning_combination_group |> distinct(group, spanning_key, spanning, all_sides) |> group_by(group, spanning_key) |> @@ -229,8 +251,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ .groups = "drop" ) |> mutate( - # Pour chaque combinaison, chercher tous les sides des tables - # dont le spanning_set contient cette combinaison sides_couverts = purrr::map2(spanning, group, function(span_set, grp) { spanning_combination_group |> filter(group == grp) |> @@ -239,7 +259,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ sort() |> unique() }), - sides_manquants = purrr::map2(all_sides, sides_couverts, setdiff), + sides_manquants = purrr::map2(all_sides, sides_couverts, setdiff), all_combinations = purrr::map_lgl(sides_manquants, ~ length(.x) == 0) ) |> unnest_wider(spanning, names_sep = "_") @@ -252,82 +272,69 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ purrr::compact() |> dplyr::bind_rows() - ############################################################################## table_group_mapping <- df_eq_initial_spannings %>% - # On éclate le table_name combiné pour retrouver les tables individuelles mutate(table_name_combined = table_name) %>% tidyr::separate_rows(table_name, sep = "\\.") %>% select(table_name, table_name_combined, group) - totcode_equation <- df_spannings_eq %>% + totcode_equation <- df_with_group %>% # <-- 
df_with_group, pas df_spannings_eq filter(side == "total") %>% group_by(group) %>% summarise(totcode = first(var_mapped), .groups = "drop") - # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. - # Each equation name is transformed into its uppercase form with a "^h" suffix, - # and its hierarchical version prefixed with "hrc_". - df_eq_indicator_spannings <- df_spannings_eq %>% + df_eq_indicator_spannings <- df_with_group %>% # <-- df_with_group, pas df_spannings_eq filter(!is.na(eq_name)) %>% left_join(table_group_mapping, by = c("table_name", "group")) %>% left_join(totcode_equation, by = "group") %>% group_by(group, table_name_combined) %>% summarise( - table_name = first(table_name_combined), - field = last(field), - hrc_field = last(hrc_field), - spanning = if(length(unique(eq_name)) > 1) { + table_name = first(table_name_combined), + field = last(field), + hrc_field = last(hrc_field), + spanning = if (length(unique(eq_name)) > 1) { paste0(paste0(unique(toupper(eq_name)), collapse = "_"), "^h") } else { paste0(toupper(last(eq_name)), "^h") }, - hrc_spanning = if(length(unique(eq_name)) > 1) { - paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_"),".totcode.",first(totcode)) + hrc_spanning = if (length(unique(eq_name)) > 1) { + paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_"), ".totcode.", first(totcode)) } else { - paste0("hrc_", toupper(last(eq_name)),".totcode.",first(totcode)) + paste0("hrc_", toupper(last(eq_name)), ".totcode.", first(totcode)) }, - indicator = last(unit), + indicator = last(unit), hrc_indicator = last(hrc_indicator), - .groups = "drop" + .groups = "drop" ) %>% dplyr::distinct(group, table_name, spanning, hrc_spanning, .keep_all = TRUE) - # 'df_indicators' combines both initial and indicator spanning information - # into a single harmonized dataset, keeping key structural columns - # and sorting rows by table name. 
- df_indicators <- bind_rows(df_eq_initial_spannings,df_eq_indicator_spannings) %>% - select(table_name,field,hrc_field,indicator,hrc_indicator,everything()) %>% + df_indicators <- bind_rows(df_eq_initial_spannings, df_eq_indicator_spannings) %>% + select(table_name, field, hrc_field, indicator, hrc_indicator, everything()) %>% arrange(table_name) - # 'df_no_eq_spannings' contains all spanning rows - # that are not associated with any equation (eq_name is missing). - df_no_eq_spannings <- df_spannings_eq %>% filter(is.na(eq_name)) - - if(nrow(df_no_eq_spannings) > 0){ - if(all(is.na(df_no_eq_spannings$hrc_indicator))){ - df_indicators <- bind_rows(df_indicators,df_no_eq_spannings) %>% arrange(table_name) - return(list(df_indicators,df_variable_info)) + # Lignes sans group (sans équation) — df_without_group réutilisé ici + if (nrow(df_without_group) > 0) { + if (all(is.na(df_without_group$hrc_indicator))) { + df_indicators <- bind_rows(df_indicators, df_without_group) %>% arrange(table_name) + return(list(df_indicators, df_variable_info)) } else { - df_no_eq_indicators <- df_no_eq_spannings %>% + df_no_eq_indicators <- df_without_group %>% filter(!is.na(hrc_indicator)) %>% dplyr::group_by(table_name) %>% summarise( - field = last(field), - hrc_field = last(hrc_field), - spanning = paste0(toupper(last(hrc_indicator)),"^h"), - hrc_spanning = last(hrc_indicator), - indicator = last(indicator), + field = last(field), + hrc_field = last(hrc_field), + spanning = paste0(toupper(last(hrc_indicator)), "^h"), + hrc_spanning = last(hrc_indicator), + indicator = last(indicator), hrc_indicator = last(hrc_indicator) ) %>% bind_rows(df_spannings, .) 
%>% arrange(table_name) - df_indicators <- bind_rows(df_indicators,df_no_eq_indicators) %>% arrange(table_name) - list_hrc_identified = list(df_indicators,df_variable_info) - return(list_hrc_identified) + df_indicators <- bind_rows(df_indicators, df_no_eq_indicators) %>% arrange(table_name) + return(list(df_indicators, df_variable_info)) } } else { - list_hrc_identified = list(df_indicators,df_variable_info) - return(list_hrc_identified) + return(list(df_indicators, df_variable_info)) } } From 02aff21896800cf70c5a83bbd35916966d8d53a1 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 23 Jun 2026 15:37:01 +0200 Subject: [PATCH 22/38] test: 6 examples to check the analysis of links between indicators --- tests/testthat/test_analyse_metadata.R | 209 +++++++++++++++++++++++-- 1 file changed, 198 insertions(+), 11 deletions(-) diff --git a/tests/testthat/test_analyse_metadata.R b/tests/testthat/test_analyse_metadata.R index 75340a8..33b8cdb 100644 --- a/tests/testthat/test_analyse_metadata.R +++ b/tests/testthat/test_analyse_metadata.R @@ -162,10 +162,25 @@ test_that("hierarchies on indicators", { ##################################################### INDICATOR EQUATIONS CHECKS # all the spanning variables are taken into account when using equations on # indicators ------------------------------------------------------------------- +df_eq_lettuce_1 <- data.frame( + eq_name = c("eq1"), + eq_indicator = c("to_lettuce = to_batavia + to_arugula"), + unit = c("EUR"), + stringsAsFactors = FALSE +) + +df_eq_lettuce_2 <- data.frame( + eq_name = c("eq1","eq2"), + eq_indicator = c("to_lettuce = to_batavia + to_arugula", + "to_pizza = to_tomates + to_pate"), + unit = c("EUR","EUR"), + stringsAsFactors = FALSE +) + answer <- data.frame( cluster = c( - "france_entreprises_2023.hrc_lettuce", - "france_entreprises_2023.hrc_lettuce", + "france_entreprises_2023.EUR", + "france_entreprises_2023.EUR", "france_entreprises_2023.to_pizza", "france_entreprises_2023.to_pizza" ), @@ -176,26 
+191,198 @@ answer <- data.frame( "T3.T4.T5.T6" ), field = rep("france_entreprises_2023", 4), - indicator = c("LETTUCE", "LETTUCE", "to_pizza", "to_pizza"), + indicator = c("to_lettuce", "to_lettuce", "to_pizza", "to_pizza"), spanning_1 = c("HRC_NAF", "HRC_NAF", "HRC_NUTS", "HRC_NAF"), spanning_2 = c("cj", "size", "size", "HRC_NUTS"), - spanning_3 = c("HRC_LETTUCE^h", "HRC_LETTUCE^h", NA, NA), + spanning_3 = c("EQ1^h", "EQ1^h", NA, NA), hrc_spanning_1 = c("hrc_naf", "hrc_naf", "hrc_nuts", "hrc_naf"), hrc_spanning_2 = c(NA, NA, NA, "hrc_nuts"), - hrc_spanning_3 = c("hrc_lettuce", "hrc_lettuce", NA, NA) + hrc_spanning_3 = c("hrc_EQ1.totcode.to_lettuce", "hrc_EQ1.totcode.to_lettuce", NA, NA) ) test_that("indicators equation", { - df_eq_ex <- data.frame( - eq_name = c("eq1"), - eq_indicator = c("to_lettuce = to_batavia + to_arugula"), - unit = c("EUR"), - stringsAsFactors = FALSE + expect_warning( + expect_equal( + analyse_metadata(df_metadata = metadata_pizza_lettuce,df_eq_indicator = df_eq_lettuce_1), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# Nommer test 1 ---------------------------------------------------------------- +answer <- data.frame( + cluster = c("france_entreprises_2023.EUR"), + table_name = c("T11.T7.T9"), + field = c("france_entreprises_2023"), + indicator = c("to_lettuce"), + spanning_1 = c("a10"), + spanning_2 = c("EQ1^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = c("hrc_EQ1.totcode.to_lettuce") +) + +test_that("meme_var_crois_1", { + meta <- metadata_pizza_lettuce[,c(1:7)] %>% filter(table_name %in% c("T7","T9","T11")) + meta$hrc_spanning_1 <- NA_character_ + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_1), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# Nommer test 2 ---------------------------------------------------------------- +answer <- data.frame( + cluster = c("france_entreprises_2023.EUR","france_entreprises_2023.EUR"), 
+ table_name = c("T10.T12.T8","T11.T7.T9"), + field = c("france_entreprises_2023","france_entreprises_2023"), + indicator = c("to_lettuce","to_lettuce"), + spanning_1 = c("a10","a10"), + spanning_2 = c("cj","size"), + spanning_3 = c("EQ1^h","EQ1^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = NA_character_, + hrc_spanning_3 = c("hrc_EQ1.totcode.to_lettuce","hrc_EQ1.totcode.to_lettuce") +) + +test_that("meme_var_crois_2", { + meta <- metadata_pizza_lettuce[c(7:12),] + meta$hrc_spanning_1 <- NA_character_ + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_1), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# Nommer test 3 ---------------------------------------------------------------- +answer <- data.frame( + cluster = c("france_entreprises_2023.EUR","france_entreprises_2023.EUR"), + table_name = c("T10.T11.T12.T7.T8.T9","T4.T5.T6"), + field = c("france_entreprises_2023","france_entreprises_2023"), + indicator = c("to_lettuce","to_pizza"), + spanning_1 = c("a10","a10"), + spanning_2 = c("EQ1^h","EQ2^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = c("hrc_EQ1.totcode.to_lettuce","hrc_EQ2.totcode.to_pizza") +) + +test_that("meme_var_crois_1_deux_eq", { + meta <- metadata_pizza_lettuce[c(4:12),c(1:7)] + meta$indicator <- c("to_pizza","to_tomates","to_pate","to_batavia","to_batavia","to_arugula","to_arugula","to_lettuce","to_lettuce") + meta <- meta %>% mutate(spanning_1 = "a10",hrc_spanning_1 = NA_character_) + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_2), + answer + ), + "hrc_indicator column will be ignored" ) +} +) + +# Nommer test 4 ---------------------------------------------------------------- +answer <- data.frame( + cluster = rep("france_entreprises_2023.EUR"), + table_name = c("T10.T12.T8","T11.T7.T9","T4.T5.T6"), + field = rep("france_entreprises_2023"), + indicator = 
c("to_lettuce","to_lettuce","to_pizza"), + spanning_1 = rep("a10"), + spanning_2 = c("cj","size","size"), + spanning_3 = c("EQ1^h","EQ1^h","EQ2^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = NA_character_, + hrc_spanning_3 = c("hrc_EQ1.totcode.to_lettuce","hrc_EQ1.totcode.to_lettuce","hrc_EQ2.totcode.to_pizza") +) + +test_that("meme_var_crois_2_deux_eq", { + meta <- metadata_pizza_lettuce[c(4:12),] + meta$indicator <- c("to_pizza","to_tomates","to_pate","to_batavia","to_batavia","to_arugula","to_arugula","to_lettuce","to_lettuce") + meta <- meta %>% mutate(spanning_1 = "a10", + hrc_spanning_1 = NA_character_, + spanning_2 = c("size","size","size","size","cj","size","cj","size","cj"), + hrc_spanning_2 = NA_character_) + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_2), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# Nommer test 5 ---------------------------------------------------------------- +answer <- data.frame( + cluster = rep("france_entreprises_2023.EUR"), + table_name = c("T11","T7.T9"), + field = rep("france_entreprises_2023"), + indicator = c("to_lettuce","EUR"), + spanning_1 = c("cj","a10"), + spanning_2 = c("EQ1^h","EQ1^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = c("hrc_EQ1.totcode.to_lettuce","hrc_EQ1.totcode.to_lettuce") +) + + +test_that("pas_meme_var_crois_1", { + meta <- metadata_pizza_lettuce[,c(1:7)] %>% + filter(table_name %in% c("T7","T9","T11")) %>% + mutate(spanning_1 = c("a10","a10","cj")) + meta$hrc_spanning_1 <- NA_character_ + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_1), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + +# Nommer test 6 ---------------------------------------------------------------- +answer <- data.frame( + cluster = rep("france_entreprises_2023.EUR"), + table_name = c("T1.T2","T1.T2.T3"), + field = rep("france_entreprises_2023"), + 
indicator = c("EUR","EUR"), + spanning_1 = c("a10","a10"), + spanning_2 = c("EQ1^h","size"), + spanning_3 = c(NA,"EQ1^h"), + hrc_spanning_1 = NA_character_, + hrc_spanning_2 = c("hrc_EQ1.totcode.to_lettuce",NA), + hrc_spanning_3 = c(NA,"hrc_EQ1.totcode.to_lettuce") +) + +test_that("pas_meme_var_crois_2", { + meta <- metadata_pizza_lettuce %>% filter(table_name %in% c("T7","T9","T11")) + meta$spanning_2 <- c(NA,NA,"size") + meta$hrc_spanning_1 <- NA_character_ + meta$hrc_indicator <- NA_character_ + meta$table_name <- c("T1","T2","T3") + expect_warning( expect_equal( - analyse_metadata(df_metadata = metadata_pizza_lettuce,df_eq_indicator = df_eq_ex), + analyse_metadata(df_metadata = meta,df_eq_indicator = df_eq_lettuce_1), answer ), "hrc_indicator column will be ignored" From a320bae9d36f9bdd343abb43e815ea68a0d4fff7 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 23 Jun 2026 15:39:29 +0200 Subject: [PATCH 23/38] refactor: delete links because links_full is sufficient --- R/identify_hrc_with_eq.R | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 5dcefa0..0920a32 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -70,22 +70,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::ungroup() %>% dplyr::select(eq_name, unit, total, everything()) - # Identify chained equations (A = B + C, B = D + E) and group equations together - - # Build dependency links between totals and rhs - links <- parsed_equations %>% - tidyr::pivot_longer( - cols = starts_with("rhs"), - names_to = "rhs_term", - values_to = "rhs" - ) %>% - dplyr::filter(!is.na(rhs)) %>% - dplyr::mutate( - total = trimws(as.character(total)), - rhs = trimws(as.character(rhs)) - ) %>% - dplyr::distinct() - # Identify ambiguous totals total_counts <- parsed_equations %>% dplyr::count(total, name = "n_total") ambiguous_totals <- total_counts %>% 
dplyr::filter(n_total > 1) %>% pull(total) @@ -107,11 +91,22 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::ungroup() %>% dplyr::select(eq_name, total, total_alt) - # Apply the mapping to the links - # 'links' contains total, rhs, eq_name (if not, it must be joined beforehand) + # Identify chained equations (A = B + C, B = D + E) and group equations together + # and apply the mapping to the links # here we assume links has an eq_name column; otherwise do # left_join(links, parsed_equations %>% select(eq_name, total, rhs), ...) first - links_full <- links %>% + links_full <- parsed_equations %>% + tidyr::pivot_longer( + cols = starts_with("rhs"), + names_to = "rhs_term", + values_to = "rhs" + ) %>% + dplyr::filter(!is.na(rhs)) %>% + dplyr::mutate( + total = trimws(as.character(total)), + rhs = trimws(as.character(rhs)) + ) %>% + dplyr::distinct() %>% # replace total with its equation-specific alternative left_join(alt_map, by = c("eq_name", "total")) %>% mutate(total = dplyr::coalesce(total_alt, total)) %>% From 81ca0d629d6e342955b3c7a3c6dfcd97a10a828f Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 23 Jun 2026 15:40:51 +0200 Subject: [PATCH 24/38] refactor: delete ambiguous_totals --- R/identify_hrc_with_eq.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 0920a32..429b3e1 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -72,7 +72,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # Identify ambiguous totals total_counts <- parsed_equations %>% dplyr::count(total, name = "n_total") - ambiguous_totals <- total_counts %>% dplyr::filter(n_total > 1) %>% pull(total) # Build a total -> total_alt mapping by eq_name # For all equations (ambiguous or not), create one row; From 3e9ef85d722505e0658ed242a9f5f1030b0c674e Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 23 Jun 2026 16:15:11 +0200 Subject: [PATCH 25/38] refactor: 
more straight forward way to create df_variable_info --- R/identify_hrc_with_eq.R | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 429b3e1..e2a231f 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -163,35 +163,25 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # (without the 'hrc_' prefix) if available and 'indicator' not part of 'df_eq_indicator' indic_not_in_eq <- setdiff(unique(df_metadata_long$indicator),unique(equations_long$var)) + df_variable_info <- df_metadata_long %>% + mutate( + spanning_new = ifelse(is.na(hrc_spanning), spanning, toupper(hrc_spanning)) + ) %>% + distinct(var_start_name = spanning, var_end_name = spanning_new, table_name) + df_spannings <- df_metadata_long %>% - mutate(spanning_old = spanning, - spanning = ifelse(is.na(hrc_spanning), - spanning, - toupper(hrc_spanning)), - indicator = ifelse(indicator %in% indic_not_in_eq & !is.na(hrc_indicator), - toupper(sub("hrc_","",hrc_indicator)), - indicator), - hrc_indicator = ifelse(indicator %in% unique(equations_long$var), - NA, - hrc_indicator)) - - # 'df_variable_info' is a reference table linking original spanning names ('spanning_old') - # to their transformed counterparts ('spanning'), along with the corresponding table name. - df_variable_info <- data.frame( - var_start_name = df_spannings$spanning_old, - var_end_name = df_spannings$spanning, - table_name = df_spannings$table_name - ) %>% unique() - - # Update 'df_spannings' by removing the temporary 'spanning_old' column. 
- df_spannings <- df_spannings %>% select(-spanning_old) + mutate( + spanning = ifelse(is.na(hrc_spanning), spanning, toupper(hrc_spanning)), + indicator = ifelse(indicator %in% indic_not_in_eq & !is.na(hrc_indicator), + toupper(sub("hrc_", "", hrc_indicator)), indicator), + hrc_indicator = ifelse(indicator %in% unique(equations_long$var), NA, hrc_indicator) + ) df_spannings_eq <- df_spannings %>% # delete all the non-word elements, specifically for the white spaces mutate(across(dplyr::where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% left_join(equations_long_full, by = c("indicator" = "var")) - ############################################################################## df_with_group <- df_spannings_eq %>% filter(!is.na(group)) df_without_group <- df_spannings_eq %>% filter(is.na(group)) From 0ee16ee6cb02327eb45bbe6074744717db5b70b6 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Thu, 25 Jun 2026 15:06:33 +0200 Subject: [PATCH 26/38] feat: build_spanning_based_on_hrc_indicator() + warning when incoherence between df_metadata and df_eq_indicator --- R/identify_hrc_with_eq.R | 133 ++++++++++++++++++++++----------------- 1 file changed, 74 insertions(+), 59 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index e2a231f..9a6f58a 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -47,10 +47,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # check that the input is in the right format: right column names check_column_names <- function(df) { - # Expected fixed column names fixed_columns <- c("eq_name", "eq_indicator", "unit") - - # Check that the fixed columns exist if (!all(fixed_columns %in% names(df))) { stop("Error: The dataframe describing the equations between indicators is missing one or more required columns: eq_name, eq_indicator, unit.") @@ -106,15 +103,9 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ rhs = trimws(as.character(rhs)) ) %>% 
dplyr::distinct() %>% - # replace total with its equation-specific alternative left_join(alt_map, by = c("eq_name", "total")) %>% mutate(total = dplyr::coalesce(total_alt, total)) %>% select(-total_alt) %>% - # now replace rhs if it exists as a "total" in alt_map: - # we must choose the correct total_alt for rhs according to the equation - # where it plays the role of a total. - # to do so, join alt_map by mapping rhs -> total, keeping the alt - # corresponding to the SOURCE row eq_name. left_join(alt_map, by = c("eq_name", "rhs" = "total")) %>% mutate(rhs = dplyr::coalesce(total_alt, rhs)) %>% select(total, rhs, eq_name) %>% @@ -171,14 +162,13 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ df_spannings <- df_metadata_long %>% mutate( - spanning = ifelse(is.na(hrc_spanning), spanning, toupper(hrc_spanning)), - indicator = ifelse(indicator %in% indic_not_in_eq & !is.na(hrc_indicator), - toupper(sub("hrc_", "", hrc_indicator)), indicator), + spanning = ifelse(is.na(hrc_spanning), spanning, toupper(hrc_spanning)), + indicator = ifelse(indicator %in% indic_not_in_eq & !is.na(hrc_indicator), + toupper(sub("hrc_", "", hrc_indicator)), indicator), hrc_indicator = ifelse(indicator %in% unique(equations_long$var), NA, hrc_indicator) ) df_spannings_eq <- df_spannings %>% - # delete all the non-word elements, specifically for the white spaces mutate(across(dplyr::where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% left_join(equations_long_full, by = c("indicator" = "var")) @@ -187,27 +177,17 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # --- CAS 1 : aucune ligne avec group → pas d'équation à traiter --- if (nrow(df_with_group) == 0) { + warning( + "Check the coherence of `df_eq_indicator` and `df_metadata`. + There is no table description in `df_metadata` with an indicator that is part of one of the equations provided in `df_eq_indicator`. 
+ `df_eq_indicator` is useless here and will be ignored.") if (nrow(df_without_group) > 0) { if (all(is.na(df_without_group$hrc_indicator))) { return(list(df_without_group, df_variable_info)) } else { - df_no_eq_indicators <- df_without_group %>% - filter(!is.na(hrc_indicator)) %>% - dplyr::group_by(table_name) %>% - summarise( - field = last(field), - hrc_field = last(hrc_field), - spanning = paste0(toupper(last(hrc_indicator)), "^h"), - hrc_spanning = last(hrc_indicator), - indicator = last(indicator), - hrc_indicator = last(hrc_indicator) - ) %>% - bind_rows(df_spannings, .) %>% - arrange(table_name) + df_no_eq_indicators <- build_spanning_based_on_hrc_indicator(df_without_group,df_spannings) return(list(df_no_eq_indicators, df_variable_info)) } - } else { - return(list(df_spannings_eq, df_variable_info)) } } @@ -216,12 +196,12 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ group_by(group, table_name) |> summarise( spanning = list(sort(unique(spanning))), - side = first(side), - .groups = "drop" + side = first(side), + .groups = "drop" ) |> group_by(group) |> mutate( - all_sides = list(sort(unique(side))), + all_sides = list(sort(unique(side))), spanning_key = purrr::map_chr(spanning, paste, collapse = "|") ) |> ungroup() @@ -230,12 +210,12 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ distinct(group, spanning_key, spanning, all_sides) |> group_by(group, spanning_key) |> summarise( - spanning = list(spanning[[1]]), + spanning = list(spanning[[1]]), all_sides = list(all_sides[[1]]), - .groups = "drop" + .groups = "drop" ) |> mutate( - sides_couverts = purrr::map2(spanning, group, function(span_set, grp) { + covered_sides = purrr::map2(spanning, group, function(span_set, grp) { spanning_combination_group |> filter(group == grp) |> filter(purrr::map_lgl(spanning, ~ all(span_set %in% .x))) |> @@ -243,7 +223,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ sort() |> unique() }), - 
sides_manquants = purrr::map2(all_sides, sides_couverts, setdiff), + sides_manquants = purrr::map2(all_sides, covered_sides, setdiff), all_combinations = purrr::map_lgl(sides_manquants, ~ length(.x) == 0) ) |> unnest_wider(spanning, names_sep = "_") @@ -272,10 +252,10 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ left_join(totcode_equation, by = "group") %>% group_by(group, table_name_combined) %>% summarise( - table_name = first(table_name_combined), - field = last(field), - hrc_field = last(hrc_field), - spanning = if (length(unique(eq_name)) > 1) { + table_name = first(table_name_combined), + field = last(field), + hrc_field = last(hrc_field), + spanning = if (length(unique(eq_name)) > 1) { paste0(paste0(unique(toupper(eq_name)), collapse = "_"), "^h") } else { paste0(toupper(last(eq_name)), "^h") @@ -285,9 +265,9 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ } else { paste0("hrc_", toupper(last(eq_name)), ".totcode.", first(totcode)) }, - indicator = last(unit), + indicator = last(unit), hrc_indicator = last(hrc_indicator), - .groups = "drop" + .groups = "drop" ) %>% dplyr::distinct(group, table_name, spanning, hrc_spanning, .keep_all = TRUE) @@ -301,19 +281,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ df_indicators <- bind_rows(df_indicators, df_without_group) %>% arrange(table_name) return(list(df_indicators, df_variable_info)) } else { - df_no_eq_indicators <- df_without_group %>% - filter(!is.na(hrc_indicator)) %>% - dplyr::group_by(table_name) %>% - summarise( - field = last(field), - hrc_field = last(hrc_field), - spanning = paste0(toupper(last(hrc_indicator)), "^h"), - hrc_spanning = last(hrc_indicator), - indicator = last(indicator), - hrc_indicator = last(hrc_indicator) - ) %>% - bind_rows(df_spannings, .) 
%>% - arrange(table_name) + df_no_eq_indicators <- build_spanning_based_on_hrc_indicator(df_without_group,df_spannings) df_indicators <- bind_rows(df_indicators, df_no_eq_indicators) %>% arrange(table_name) return(list(df_indicators, df_variable_info)) } @@ -322,6 +290,53 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ } } +#' Build a data frame of indicators without equations +#' +#' Internal helper that aggregates rows from \code{df_without_group} that have +#' a non-\code{NA} \code{hrc_indicator}, and appends them to \code{df_spannings}. +#' The result is used to represent response variables that are linked by a +#' hierarchy but not by any equation. +#' +#' @param df_without_group A data frame containing the rows of +#' \code{df_spannings_eq} that do not belong to any equation group +#' (\code{group} is \code{NA}). Must contain the following columns: +#' \code{table_name}, \code{field}, \code{hrc_field}, \code{indicator}, +#' and \code{hrc_indicator}. +#' @param df_spannings A data frame derived from \code{df_metadata_long} with +#' renamed spanning and indicator variables. It is used as the base to which +#' the newly built rows are appended via \code{bind_rows}. +#' +#' @return A data frame with one row per \code{table_name} for the non-\code{NA} +#' \code{hrc_indicator} rows, appended to \code{df_spannings} and sorted by +#' \code{table_name}. 
The returned columns are: +#' \describe{ +#' \item{table_name}{Name of the table.} +#' \item{field}{Last value of \code{field} within the group.} +#' \item{hrc_field}{Last value of \code{hrc_field} within the group.} +#' \item{spanning}{Uppercase \code{hrc_indicator} suffixed with \code{^h}.} +#' \item{hrc_spanning}{Last value of \code{hrc_indicator} within the group.} +#' \item{indicator}{Last value of \code{indicator} within the group.} +#' \item{hrc_indicator}{Last value of \code{hrc_indicator} within the group.} +#' } +#' +#' @keywords internal +build_spanning_based_on_hrc_indicator <- function(df_without_group, df_spannings) { + df_without_group %>% + filter(!is.na(hrc_indicator)) %>% + dplyr::group_by(table_name) %>% + summarise( + field = last(field), + hrc_field = last(hrc_field), + spanning = paste0(toupper(last(hrc_indicator)), "^h"), + hrc_spanning = last(hrc_indicator), + indicator = last(indicator), + hrc_indicator = last(hrc_indicator) + ) %>% + bind_rows(df_spannings, .) %>% + arrange(table_name) +} + + #' Regroup tables within a group (i.e. 
equation / group of linked equations) #' based on spanning combination completeness #' @@ -378,7 +393,7 @@ regroup_tables <- function(df_group, spanning_combination_group) { spanning_by_table <- spanning_by_table |> left_join(span_comb, by = "spanning_key") - tables_complete <- spanning_by_table |> filter(all_combinations) |> pull(table_name) + tables_complete <- spanning_by_table |> filter(all_combinations) |> pull(table_name) tables_incomplete <- spanning_by_table |> filter(!all_combinations) |> pull(table_name) # Tables complètes -> fusionner par spanning_key identique @@ -388,10 +403,10 @@ regroup_tables <- function(df_group, spanning_combination_group) { left_join(spanning_by_table |> select(table_name, spanning_key), by = "table_name") |> group_by(across(-c(table_name, side, var_mapped, indicator))) |> summarise( - table_name = paste(sort(unique(table_name)), collapse = "."), - indicator = last(unit), + table_name = paste(sort(unique(table_name)), collapse = "."), + indicator = last(unit), initial_indicator = var_mapped[side == "total"][1], - .groups = "drop" + .groups = "drop" ) |> select(-spanning_key) } @@ -402,7 +417,7 @@ regroup_tables <- function(df_group, spanning_combination_group) { filter(table_name %in% tables_incomplete) |> mutate( initial_indicator = var_mapped[side == "total"][1], - indicator = unit + indicator = unit ) |> select(-c(side, var_mapped)) } From e8d7b072f0d0625c43d7ae0ec69a93d690c0f55b Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Thu, 25 Jun 2026 15:08:00 +0200 Subject: [PATCH 27/38] doc: delete comment explaining how to create graph of equations between indicators --- R/identify_hrc_with_eq.R | 5 ----- 1 file changed, 5 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 9a6f58a..72b2a9b 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -118,11 +118,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ comp_full <- igraph::components(g_full)$membership 
comp_df <- data.frame(var = names(comp_full), group = as.integer(comp_full), stringsAsFactors = FALSE) - ############################################################################## - # browser() # use this combined with "./rtauargus/dev/graphes_equations_objet_browser.R" - # to get the graphs showing indicators links based on the equations - ############################################################################## - # reformat parsed_equations in long format in order to join with df_metadata_long equations_long <- parsed_equations %>% mutate(across(c(total, starts_with("rhs")), trimws)) %>% From 7bd0244c882c41729abbb48b80125f660ceee9e3 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Thu, 25 Jun 2026 15:11:21 +0200 Subject: [PATCH 28/38] doc: delete comments to make code clearer --- R/identify_hrc_with_eq.R | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 72b2a9b..0920790 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -111,10 +111,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ select(total, rhs, eq_name) %>% dplyr::distinct() - # Build the full graph (including all copies) g_full <- graph_from_data_frame(links_full %>% select(total, rhs), directed = TRUE) - - # Compute components on g_full comp_full <- igraph::components(g_full)$membership comp_df <- data.frame(var = names(comp_full), group = as.integer(comp_full), stringsAsFactors = FALSE) @@ -123,24 +120,16 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ mutate(across(c(total, starts_with("rhs")), trimws)) %>% tidyr::pivot_longer( cols = c(total, starts_with("rhs")), - names_to = "side", # côté équation (total / rhs1 / rhs2...) 
+ names_to = "side", values_to = "var" ) %>% filter(!is.na(var)) - # Update equations_long: - # associate the alternative variable (if present) and the corresponding group - # Notes: - # - equations_long contains the original variables (var) and eq_name; - # - we want to recover the "var" or "var_alt" version used in g_full. equations_long_full <- equations_long %>% - # join the correspondence eq_name + var (original total) -> total_alt (if any) left_join(alt_map, by = c("eq_name", "var" = "total")) %>% mutate(var_mapped = dplyr::coalesce(total_alt, var)) %>% select(-total_alt) %>% - # join the group computed on the full graph left_join(comp_df, by = c("var_mapped" = "var")) %>% - # for var_mapped without a group (isolated), keep NA or assign a single group mutate(group = as.integer(group)) # 'df_spannings' is a modified version of 'df_metadata_long' where: @@ -170,7 +159,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ df_with_group <- df_spannings_eq %>% filter(!is.na(group)) df_without_group <- df_spannings_eq %>% filter(is.na(group)) - # --- CAS 1 : aucune ligne avec group → pas d'équation à traiter --- if (nrow(df_with_group) == 0) { warning( "Check the coherence of `df_eq_indicator` and `df_metadata`. 
@@ -186,7 +174,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ } } - # --- CAS 2 : il y a des lignes avec group → traitement des équations --- spanning_combination_group <- df_with_group |> group_by(group, table_name) |> summarise( @@ -236,12 +223,12 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ tidyr::separate_rows(table_name, sep = "\\.") %>% select(table_name, table_name_combined, group) - totcode_equation <- df_with_group %>% # <-- df_with_group, pas df_spannings_eq + totcode_equation <- df_with_group %>% filter(side == "total") %>% group_by(group) %>% summarise(totcode = first(var_mapped), .groups = "drop") - df_eq_indicator_spannings <- df_with_group %>% # <-- df_with_group, pas df_spannings_eq + df_eq_indicator_spannings <- df_with_group %>% filter(!is.na(eq_name)) %>% left_join(table_group_mapping, by = c("table_name", "group")) %>% left_join(totcode_equation, by = "group") %>% @@ -376,12 +363,10 @@ build_spanning_based_on_hrc_indicator <- function(df_without_group, df_spannings regroup_tables <- function(df_group, spanning_combination_group) { current_group <- unique(df_group$group) - # spanning_key par table spanning_by_table <- df_group |> group_by(table_name) |> summarise(spanning_key = paste(sort(unique(spanning)), collapse = "|"), .groups = "drop") - # Récupérer le statut complet/incomplet par spanning_key span_comb <- spanning_combination_group |> filter(group == current_group) |> select(spanning_key, all_combinations) @@ -391,7 +376,6 @@ regroup_tables <- function(df_group, spanning_combination_group) { tables_complete <- spanning_by_table |> filter(all_combinations) |> pull(table_name) tables_incomplete <- spanning_by_table |> filter(!all_combinations) |> pull(table_name) - # Tables complètes -> fusionner par spanning_key identique df_merged <- if (length(tables_complete) > 0) { df_group |> filter(table_name %in% tables_complete) |> @@ -406,7 +390,6 @@ regroup_tables <- function(df_group, 
spanning_combination_group) { select(-spanning_key) } - # Tables incomplètes -> garder seules df_solo <- if (length(tables_incomplete) > 0) { df_group |> filter(table_name %in% tables_incomplete) |> From e412269c974db7971e69eb290456e3a7d8f4f212 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Fri, 26 Jun 2026 14:43:45 +0200 Subject: [PATCH 29/38] doc: add param to tab_to_treat() --- R/tab_to_treat.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/tab_to_treat.R b/R/tab_to_treat.R index bc41fdd..7224d2d 100644 --- a/R/tab_to_treat.R +++ b/R/tab_to_treat.R @@ -85,6 +85,12 @@ tab_to_treat <- function(list_independent_tables) { #' @param list_independent_tables A list of tibbles, typically the output of #' `grp_tab_in_cluster()` or `tab_to_treat()`. Each tibble contains metadata #' for tables grouped within a specific cluster. +#' @param list_hrc_identified A list returned by the `identify_hrc` function. The first +#' element of the list must be a data frame containing the variables: +#' - `field`: A grouping variable. +#' - `hrc_field`: The hierarchical counterpart of `field`. +#' - `indicator`: A variable used to link tables. +#' - `hrc_indicator`: The hierarchical counterpart of `indicator`. #' #' @return A single dataframe (`dfMetadata_to_treat`) with the following structure: #' - `cluster`: Identifier for the cluster each table belongs to. 
From 6b2a4581373fcd64f709f122b959511d853b3e78 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Fri, 26 Jun 2026 14:50:02 +0200 Subject: [PATCH 30/38] update doc --- R/globals.R | 5 +- R/identify_hrc_with_eq.R | 2 +- man/build_spanning_based_on_hrc_indicator.Rd | 40 +++++++++++++++ man/dataframe_result.Rd | 11 ++++- man/regroup_tables.Rd | 51 ++++++++++++++++++++ man/rtauargus-package.Rd | 1 + 6 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 man/build_spanning_based_on_hrc_indicator.Rd create mode 100644 man/regroup_tables.Rd diff --git a/R/globals.R b/R/globals.R index 24bc305..7fdf7ed 100644 --- a/R/globals.R +++ b/R/globals.R @@ -4,6 +4,7 @@ utils::globalVariables( "n_unique","column","unique_modalities","from.eg","to.eg","from","to","mutual_full", "Group","table_eg","spanning","hrc_spanning","spanning_old","tab_inclus", "starts_with","spanning_name","hrc_spanning_name","eq_indicator","rhs","total","term_number", - "eq_name","unit","var","n_total","total_alt","group", - ".") + "eq_name","unit","var","n_total","total_alt","group","initial_indicator","spanning_new", + "spanning_key","all_sides","side","sides_manquants","table_name_combined","var_mapped", + "totcode","all_combinations","covered_sides",".") ) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 0920790..de2a3d3 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -257,7 +257,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ select(table_name, field, hrc_field, indicator, hrc_indicator, everything()) %>% arrange(table_name) - # Lignes sans group (sans équation) — df_without_group réutilisé ici + # Tables without group (i.e. 
withtout group) if (nrow(df_without_group) > 0) { if (all(is.na(df_without_group$hrc_indicator))) { df_indicators <- bind_rows(df_indicators, df_without_group) %>% arrange(table_name) diff --git a/man/build_spanning_based_on_hrc_indicator.Rd b/man/build_spanning_based_on_hrc_indicator.Rd new file mode 100644 index 0000000..140d51d --- /dev/null +++ b/man/build_spanning_based_on_hrc_indicator.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/identify_hrc_with_eq.R +\name{build_spanning_based_on_hrc_indicator} +\alias{build_spanning_based_on_hrc_indicator} +\title{Build a data frame of indicators without equations} +\usage{ +build_spanning_based_on_hrc_indicator(df_without_group, df_spannings) +} +\arguments{ +\item{df_without_group}{A data frame containing the rows of +\code{df_spannings_eq} that do not belong to any equation group +(\code{group} is \code{NA}). Must contain the following columns: +\code{table_name}, \code{field}, \code{hrc_field}, \code{indicator}, +and \code{hrc_indicator}.} + +\item{df_spannings}{A data frame derived from \code{df_metadata_long} with +renamed spanning and indicator variables. It is used as the base to which +the newly built rows are appended via \code{bind_rows}.} +} +\value{ +A data frame with one row per \code{table_name} for the non-\code{NA} +\code{hrc_indicator} rows, appended to \code{df_spannings} and sorted by +\code{table_name}. 
The returned columns are: +\describe{ +\item{table_name}{Name of the table.} +\item{field}{Last value of \code{field} within the group.} +\item{hrc_field}{Last value of \code{hrc_field} within the group.} +\item{spanning}{Uppercase \code{hrc_indicator} suffixed with \code{^h}.} +\item{hrc_spanning}{Last value of \code{hrc_indicator} within the group.} +\item{indicator}{Last value of \code{indicator} within the group.} +\item{hrc_indicator}{Last value of \code{hrc_indicator} within the group.} +} +} +\description{ +Internal helper that aggregates rows from \code{df_without_group} that have +a non-\code{NA} \code{hrc_indicator}, and appends them to \code{df_spannings}. +The result is used to represent response variables that are linked by a +hierarchy but not by any equation. +} +\keyword{internal} diff --git a/man/dataframe_result.Rd b/man/dataframe_result.Rd index 30f6a00..001ef5d 100644 --- a/man/dataframe_result.Rd +++ b/man/dataframe_result.Rd @@ -4,12 +4,21 @@ \alias{dataframe_result} \title{Combine List of Dataframes into a Single Dataframe with Cluster Identification} \usage{ -dataframe_result(list_independent_tables) +dataframe_result(list_independent_tables, list_hrc_identified) } \arguments{ \item{list_independent_tables}{A list of tibbles, typically the output of \code{grp_tab_in_cluster()} or \code{tab_to_treat()}. Each tibble contains metadata for tables grouped within a specific cluster.} + +\item{list_hrc_identified}{A list returned by the \code{identify_hrc} function. The first +element of the list must be a data frame containing the variables: +\itemize{ +\item \code{field}: A grouping variable. +\item \code{hrc_field}: The hierarchical counterpart of \code{field}. +\item \code{indicator}: A variable used to link tables. +\item \code{hrc_indicator}: The hierarchical counterpart of \code{indicator}. 
+}} } \value{ A single dataframe (\code{dfMetadata_to_treat}) with the following structure: diff --git a/man/regroup_tables.Rd b/man/regroup_tables.Rd new file mode 100644 index 0000000..53f88a3 --- /dev/null +++ b/man/regroup_tables.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/identify_hrc_with_eq.R +\name{regroup_tables} +\alias{regroup_tables} +\title{Regroup tables within a group (i.e. equation / group of linked equations) +based on spanning combination completeness} +\usage{ +regroup_tables(df_group, spanning_combination_group) +} +\arguments{ +\item{df_group}{A tibble containing the rows of a single group from +\code{df_with_group}. Must contain columns: \code{table_name}, +\code{spanning}, \code{side}, \code{var_mapped}, \code{indicator}, +\code{unit}, and \code{group}.} + +\item{spanning_combination_group}{A tibble produced by the +\code{spanning_combination_group} pipeline, containing one row per +(group, spanning_key) combination. Must contain columns: \code{group}, +\code{spanning_key}, and \code{all_combinations} (logical indicating +whether the spanning combination covers all sides of the equation).} +} +\value{ +A tibble with one row per (merged or solo) table cluster and +spanning, containing the following columns (among others): +\describe{ +\item{table_name}{Dot-separated list of merged table names (e.g. +\code{"T7.T9.T11"}) for complete combinations, or the original +table name for incomplete ones.} +\item{indicator}{The unit value shared across the merged tables.} +\item{initial_indicator}{The \code{var_mapped} value of the \code{total} +side, used to track the original indicator before merging.} +} +} +\description{ +For a given group of tables, this function identifies which tables cover all +sides of an equation (total, rhs1, rhs2, ...) for their spanning combination, +and which do not. 
Tables with complete combinations are merged into a single +row; tables with incomplete combinations are kept as standalone rows with +their original spannings. +} +\examples{ +\dontrun{ +list_groups <- split(df_with_group, df_with_group$group) + +df_eq_initial_spannings <- purrr::map(list_groups, function(df_group) { + regroup_tables(df_group, spanning_combination_group) +}) |> + purrr::compact() |> + dplyr::bind_rows() +} +} diff --git a/man/rtauargus-package.Rd b/man/rtauargus-package.Rd index 29e9789..5f1c96b 100644 --- a/man/rtauargus-package.Rd +++ b/man/rtauargus-package.Rd @@ -23,6 +23,7 @@ Useful links: Authors: \itemize{ + \item Julien Jamme \email{julien.jamme@insee.fr} \item Pierre-Yves Berrard \email{pierre-yves.berrard@insee.fr} \item Nathanaël Rastout \email{nathanael.rastout@insee.fr} \item Jeanne Pointet From 07ce06d307a4a34355cd27882a05ab9a26502207 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Fri, 26 Jun 2026 15:00:27 +0200 Subject: [PATCH 31/38] rename tests and update DESCRIPTION --- DESCRIPTION | 2 +- tests/testthat/test_analyse_metadata.R | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 77da1a1..09d2c8f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,7 +67,6 @@ Description: Protects tables by calling the Tau-Argus software from R. 
License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 7.3.3 VignetteBuilder: knitr URL: https://inseefrlab.github.io/rtauargus, https://github.com/inseefrlab/rtauargus, @@ -75,3 +74,4 @@ URL: https://inseefrlab.github.io/rtauargus, BugReports: https://github.com/inseefrlab/rtauargus/issues Roxygen: list(markdown = TRUE) StagedInstall: no +Config/roxygen2/version: 8.0.0 diff --git a/tests/testthat/test_analyse_metadata.R b/tests/testthat/test_analyse_metadata.R index 33b8cdb..ab0f96e 100644 --- a/tests/testthat/test_analyse_metadata.R +++ b/tests/testthat/test_analyse_metadata.R @@ -212,7 +212,7 @@ test_that("indicators equation", { } ) -# Nommer test 1 ---------------------------------------------------------------- +# All indicators in the same equation broken down by the same spanning variable ---- answer <- data.frame( cluster = c("france_entreprises_2023.EUR"), table_name = c("T11.T7.T9"), @@ -239,7 +239,7 @@ test_that("meme_var_crois_1", { } ) -# Nommer test 2 ---------------------------------------------------------------- +# All indicators in the same equation broken down by the same spanning variables ---- answer <- data.frame( cluster = c("france_entreprises_2023.EUR","france_entreprises_2023.EUR"), table_name = c("T10.T12.T8","T11.T7.T9"), @@ -268,7 +268,7 @@ test_that("meme_var_crois_2", { } ) -# Nommer test 3 ---------------------------------------------------------------- +# Two equations, all indiicators broken down by the same spanning variable ----- answer <- data.frame( cluster = c("france_entreprises_2023.EUR","france_entreprises_2023.EUR"), table_name = c("T10.T11.T12.T7.T8.T9","T4.T5.T6"), @@ -296,7 +296,8 @@ test_that("meme_var_crois_1_deux_eq", { } ) -# Nommer test 4 ---------------------------------------------------------------- +# Two equations, all indicators in each equations are broken down by the same +# spanning variables ----------------------------------------------------------- answer <- data.frame( cluster = 
rep("france_entreprises_2023.EUR"), table_name = c("T10.T12.T8","T11.T7.T9","T4.T5.T6"), @@ -329,7 +330,8 @@ test_that("meme_var_crois_2_deux_eq", { } ) -# Nommer test 5 ---------------------------------------------------------------- +# One equation, but the indicators are not broken down by the same spanning +# variables (only one spanning variable by table) ------------------------------ answer <- data.frame( cluster = rep("france_entreprises_2023.EUR"), table_name = c("T11","T7.T9"), @@ -359,7 +361,8 @@ test_that("pas_meme_var_crois_1", { } ) -# Nommer test 6 ---------------------------------------------------------------- +# One equation, but the indicators are not broken down by the same spanning +# variables (one or two spanning variable by table) ------------------------------ answer <- data.frame( cluster = rep("france_entreprises_2023.EUR"), table_name = c("T1.T2","T1.T2.T3"), From e1da269fb96e619e5761fdc95625cd00e3bab180 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Fri, 26 Jun 2026 15:11:55 +0200 Subject: [PATCH 32/38] doc: update version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 09d2c8f..f6fdfaa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: rtauargus Type: Package Title: Using Tau-Argus from R Language: fr -Version: 1.3.4 +Version: 1.3.5 Depends: R (>= 3.5.0) Imports: purrr (>= 0.2), From 73c8554e80b505467858826a06fc399f12642186 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 30 Jun 2026 09:32:09 +0200 Subject: [PATCH 33/38] feat: add group name to treat tic example --- R/identify_hrc_with_eq.R | 3 ++- R/tab_to_treat.R | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index de2a3d3..ee4db03 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -254,7 +254,8 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::distinct(group, table_name, 
spanning, hrc_spanning, .keep_all = TRUE) df_indicators <- bind_rows(df_eq_initial_spannings, df_eq_indicator_spannings) %>% - select(table_name, field, hrc_field, indicator, hrc_indicator, everything()) %>% + select(table_name, field, hrc_field, indicator, hrc_indicator, spanning, hrc_spanning, group) %>% + mutate(table_name = paste(table_name,"group",group,sep="_")) %>% select(-c(group)) %>% unique() %>% arrange(table_name) # Tables without group (i.e. withtout group) diff --git a/R/tab_to_treat.R b/R/tab_to_treat.R index 7224d2d..30e8d03 100644 --- a/R/tab_to_treat.R +++ b/R/tab_to_treat.R @@ -140,7 +140,7 @@ dataframe_result <- function(list_independent_tables, list_hrc_identified) { dataframe_metadata <- purrr::imap_dfr(list_independent_tables, function(tibble, tibble_name) { tibble %>% mutate(cluster = tibble_name) }) - # browser() + # If the initial_indicator column exists in list_hrc_identified, # replace indicator with initial_indicator whenever initial_indicator is not NA if ("initial_indicator" %in% names(list_hrc_identified[[1]])) { From 5086b84fbb3b4d02491a17c2b04ebe1e5bb9056e Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 30 Jun 2026 11:29:22 +0200 Subject: [PATCH 34/38] feat: keep intial_indicator + no group name in table_name but the tic pbm still handled --- R/identify_hrc_with_eq.R | 12 +++++++++++- R/tab_to_treat.R | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index ee4db03..60df65c 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -255,9 +255,19 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ df_indicators <- bind_rows(df_eq_initial_spannings, df_eq_indicator_spannings) %>% select(table_name, field, hrc_field, indicator, hrc_indicator, spanning, hrc_spanning, group) %>% - mutate(table_name = paste(table_name,"group",group,sep="_")) %>% select(-c(group)) %>% unique() %>% + mutate(table_name = paste(table_name, "group", 
group, sep = "_")) %>% + select(-group) %>% + unique() %>% arrange(table_name) + df_initial_indicator <- bind_rows(df_eq_initial_spannings, df_eq_indicator_spannings) %>% + mutate(table_name = paste(table_name, "group", group, sep = "_")) %>% + group_by(table_name) %>% + summarise(initial_indicator = first(na.omit(initial_indicator)), .groups = "drop") + + df_indicators <- df_indicators %>% + left_join(df_initial_indicator, by = "table_name") + # Tables without group (i.e. withtout group) if (nrow(df_without_group) > 0) { if (all(is.na(df_without_group$hrc_indicator))) { diff --git a/R/tab_to_treat.R b/R/tab_to_treat.R index 30e8d03..3dd2aac 100644 --- a/R/tab_to_treat.R +++ b/R/tab_to_treat.R @@ -157,6 +157,7 @@ dataframe_result <- function(list_independent_tables, list_hrc_identified) { } dataframe_metadata <- dataframe_metadata %>% + mutate(table_name = sub("_group_.*", "", table_name)) %>% select( cluster, table_name, From dd26e292a667788c207d664ebc096a84533b4927 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 30 Jun 2026 16:41:21 +0200 Subject: [PATCH 35/38] fix: handle case where inital_indicator = NA + only removing _group_xx not everything after --- R/identify_hrc_with_eq.R | 8 +++++++- R/tab_to_treat.R | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 60df65c..1e25d80 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -263,7 +263,13 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ df_initial_indicator <- bind_rows(df_eq_initial_spannings, df_eq_indicator_spannings) %>% mutate(table_name = paste(table_name, "group", group, sep = "_")) %>% group_by(table_name) %>% - summarise(initial_indicator = first(na.omit(initial_indicator)), .groups = "drop") + summarise( + initial_indicator = { + x <- na.omit(initial_indicator) + if (length(x) == 0) NA_character_ else x[1] + }, + .groups = "drop" + ) df_indicators <- df_indicators %>% 
left_join(df_initial_indicator, by = "table_name") diff --git a/R/tab_to_treat.R b/R/tab_to_treat.R index 3dd2aac..856c03b 100644 --- a/R/tab_to_treat.R +++ b/R/tab_to_treat.R @@ -157,7 +157,7 @@ dataframe_result <- function(list_independent_tables, list_hrc_identified) { } dataframe_metadata <- dataframe_metadata %>% - mutate(table_name = sub("_group_.*", "", table_name)) %>% + mutate(table_name = gsub("_group_[0-9]+", "", table_name)) %>% select( cluster, table_name, From 426160a3f33f0accbb54ca508cdfde537d03d8f4 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 30 Jun 2026 18:33:45 +0200 Subject: [PATCH 36/38] fix: add package in front of na.omit() --- R/identify_hrc_with_eq.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 1e25d80..7d99042 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -265,7 +265,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ group_by(table_name) %>% summarise( initial_indicator = { - x <- na.omit(initial_indicator) + x <- stats::na.omit(initial_indicator) if (length(x) == 0) NA_character_ else x[1] }, .groups = "drop" From 99705c4369f24e1aae95b057da2a2171d851dfd6 Mon Sep 17 00:00:00 2001 From: Clara Baudry Date: Tue, 30 Jun 2026 18:34:30 +0200 Subject: [PATCH 37/38] fix: define regroup_tables() as an internal function --- R/identify_hrc_with_eq.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 7d99042..0bc342a 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -377,6 +377,8 @@ build_spanning_based_on_hrc_indicator <- function(df_without_group, df_spannings #' purrr::compact() |> #' dplyr::bind_rows() #' } +#' +#' @keywords internal regroup_tables <- function(df_group, spanning_combination_group) { current_group <- unique(df_group$group) From 1b52031f1abb1c68d19f0f01340bccea265d5f7f Mon Sep 17 00:00:00 2001 From: Clara Baudry Date:
Tue, 30 Jun 2026 18:40:47 +0200 Subject: [PATCH 38/38] fix: change documentation so that regroup_tables() is referenced as internal --- man/regroup_tables.Rd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/man/regroup_tables.Rd b/man/regroup_tables.Rd index 53f88a3..78f6df3 100644 --- a/man/regroup_tables.Rd +++ b/man/regroup_tables.Rd @@ -48,4 +48,6 @@ df_eq_initial_spannings <- purrr::map(list_groups, function(df_group) { purrr::compact() |> dplyr::bind_rows() } + } +\keyword{internal}