Merge pull request #4 from JulienT01/main

TimotheeMathieu · web-flow · commit b0c9f6958ee4 · 2025-08-13T10:36:08.000+02:00
Improve tests and code coverage
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,10 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# VS Code config
+.vscode/
+
+# Test artefacts
+test.pdf
+examples/.adastop_comparator.pkl
diff --git a/adastop/cli.py b/adastop/cli.py
@@ -61,10 +61,9 @@ def compare(ctx, input_file, n_groups, size_group, n_permutations, alpha, beta,
             if i in comparator.current_comparisons.ravel():
                 names.append(comparator.agent_names[i])
 
-
         Z = [np.hstack([comparator.eval_values[agent], df[agent]]) for agent in names]
-        if len(Z[0]) > comparator.K * n_fits_per_group:
-            raise ValueError('Error: you tried to use more group than what was initially declared, this is not allowed by the theory.')
+        if len(names) == 0:
+            raise ValueError('Error: you tried to use more group than necessary. Use adastop status to see current status for more info.')
         assert "continue" in list(comparator.decisions.values()), "Test finished at last iteration."
 
     else:
diff --git a/adastop/compare_agents.py b/adastop/compare_agents.py
@@ -212,7 +212,7 @@ def partial_compare(self, eval_values, verbose=True):
         if self.agent_names is None:
             self.agent_names = list(eval_values.keys())
 
-        Z = [eval_values[agent] for agent in self.agent_names]
+        Z = [np.array(eval_values[agent]) for agent in self.agent_names]
         n_managers = len(Z)
         if isinstance(self.n,int):
             self.n = np.array([self.n]*n_managers)
@@ -256,13 +256,13 @@ def partial_compare(self, eval_values, verbose=True):
 
             # Compute admissible values, i.e. values that would not be rejected nor accepted.
             admissible_values_sup = values[
-                self.level_spent + icumulative_probas <= clevel
+                self.level_spent + icumulative_probas < clevel
             ]
 
             if len(admissible_values_sup) > 0:
                 bk_sup = admissible_values_sup[0]  # the minimum admissible value
                 level_to_add = icumulative_probas[
-                    self.level_spent + icumulative_probas <= clevel
+                    self.level_spent + icumulative_probas < clevel
                 ][0]
             else:
                 # This case is possible if clevel-self.level_spent <= 1/ self.normalization (smallest proba possible),
@@ -272,7 +272,7 @@ def partial_compare(self, eval_values, verbose=True):
 
             cumulative_probas = np.arange(len(values)) / self.normalization  # corresponds to P(T < t)
             admissible_values_inf = values[
-                self.power_spent + cumulative_probas < dlevel
+                self.power_spent + cumulative_probas <= dlevel
             ]
 
             if len(admissible_values_inf) > 0:
diff --git a/docs/tutorials.md b/docs/tutorials.md
@@ -15,6 +15,17 @@ The command line interface takes csv files as input. Each csv file must contain
 Below, we give an example based on files containing the evaluations of PPO,DDPG,SAC,TRPO, four Deep Reinforcement Learning algorithmes, given in the \`examples\` directory of the main repository.
 
 
+## Installation
+
+To install adastop, use pip:
+```bash
+pip install adastop
+```
+
+This will automatically install the command line interface as well as the Python library.
+
+
+
 ## Help for cli tool
 
 The AdaStop algorithm is initialized with the first test done through \`adastop compare\` and the current state of AdaStop is then saved in a pickle file. The help of \`adastop\` command line can be obtained with the following:
@@ -90,7 +101,7 @@ The input format of adastop is under the form of a csv file containing the score
 
 Let us launch AdaStop on this first batch of data.
 
-First, we clean up the corrent directory of any litter files that could have been spawned by a previous usage of \`adastop\` (if you never used \`adastop\` before, this command will not have any effect).
+First, we clean up the current directory of any litter files that could have been spawned by a previous usage of \`adastop\` (if you never used \`adastop\` before, this command will not have any effect).
 
 ```bash
 adastop reset . # reset the state of the comparator (remove hidden pickle file)
@@ -144,14 +155,14 @@ adastop compare --n-groups 5 --size-group 5  walker5.csv
 
 Test is finished, decisions are
 
-|   | Agent1 vs Agent2 | mean Agent1 | mean Agent2 | mean diff | std Agent 1 | std Agent 2 | decisions |
-|--- |---------------- |----------- |----------- |--------- |----------- |----------- |--------- |
-| 0 | PPO vs DDPG      | 2901.53     | 884.119     | 2017.41   | 1257.93     | 535.74      | larger    |
-| 0 | PPO vs SAC       | 2901.53     | 4543.4      | -1641.87  | 1257.93     | 432.13      | smaller   |
-| 0 | PPO vs TRPO      | 2901.53     | 1215.42     | 1686.11   | 1257.93     | 529.672     | larger    |
-| 0 | DDPG vs SAC      | 884.119     | 4543.4      | -3659.28  | 535.74      | 432.13      | smaller   |
-| 0 | DDPG vs TRPO     | 884.119     | 1215.42     | -331.297  | 535.74      | 529.672     | smaller   |
-| 0 | SAC vs TRPO      | 4543.4      | 1215.42     | 3327.98   | 432.13      | 529.672     | larger    |
+| Agent1 vs Agent2 | mean Agent1 | mean Agent2 | mean diff | std Agent 1 | std Agent 2 | decisions |
+|----------------- |------------ |------------ |---------- |------------ |------------ |---------- |
+| PPO vs DDPG      | 2901.53     | 884.119     | 2017.41   | 1257.93     | 535.74      | larger    |
+| PPO vs SAC       | 2901.53     | 4543.4      | -1641.87  | 1257.93     | 432.13      | smaller   |
+| PPO vs TRPO      | 2901.53     | 1215.42     | 1686.11   | 1257.93     | 529.672     | larger    |
+| DDPG vs SAC      | 884.119     | 4543.4      | -3659.28  | 535.74      | 432.13      | smaller   |
+| DDPG vs TRPO     | 884.119     | 1215.42     | -331.297  | 535.74      | 529.672     | smaller   |
+| SAC vs TRPO      | 4543.4      | 1215.42     | 3327.98   | 432.13      | 529.672     | larger    |
 
 Comparator Saved
 
diff --git a/docs/tutorials.org b/docs/tutorials.org
@@ -15,6 +15,21 @@ Please note that if, in the process of the algorithm, all the comparisons for on
 
 Below, we give an example based on files containing the evaluations of PPO,DDPG,SAC,TRPO, four Deep Reinforcement Learning algorithmes, given in the =examples= directory of the main repository.
 
+
+
+
+** Installation
+
+To install adastop, use pip:
+
+#+begin_src bash :session *shell* :results verbatim :exports both
+pip install adastop
+#+end_src
+
+This will automatically install the command line interface as well as the Python library.
+
+
+
 ** Help for cli tool 
 
 The AdaStop algorithm is initialized with the first test done through =adastop compare= and the current state of AdaStop is then saved in a pickle file. The help of =adastop= command line can be obtained with the following:
@@ -47,7 +62,7 @@ The input format of adastop is under the form of a csv file containing the score
 
 Let us launch AdaStop on this first batch of data. 
 
-First, we clean up the corrent directory of any litter files that could have been spawned by a previous usage of =adastop= (if you never used =adastop= before, this command will not have any effect).
+First, we clean up the current directory of any litter files that could have been spawned by a previous usage of =adastop= (if you never used =adastop= before, this command will not have any effect).
 
 #+begin_src bash :session *shell* :results verbatim :exports both 
 adastop reset . # reset the state of the comparator (remove hidden pickle file)
diff --git a/docs/user_guide.md b/docs/user_guide.md
@@ -60,7 +60,7 @@ Then, once you did the comparison on the first file, you can use iteratively `ad
 
 #### Choice of comparisons
 
-In adastopn, one can choose which comparisons are done. The default is to do all the pairwise comparisons between two algorithms. In practice, it is sometimes sufficient to compare to only one of them, a benchmark, for this the `--compare-to-first` argument can be used. For a more fine-grained control on which comparison to do, the python API can take the comparisons as input.
+In adastop, one can choose which comparisons are done. The default is to do all the pairwise comparisons between two algorithms. In practice, it is sometimes sufficient to compare to only one of them, a benchmark, for this the `--compare-to-first` argument can be used. For a more fine-grained control on which comparison to do, the python API can take the comparisons as input.
 
 **Remark**: it is not statistically ok to execute adastop several times and interpret the result as though it was only one test, if adastop is run several times this is multiple testing and some calibration has to be done. Instead, it is better to do all the comparisons at the same time, running the adastop algorithm only once, and adastop will handle the multiplicity of hypotheses by itself.
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,30 +1,57 @@
 import pytest
 from click.testing import CliRunner
 from adastop.cli import adastop
+import os
 
 # we reuse a bit of pytest's own testing machinery, this should eventually come
 import subprocess
 
 
 def test_cli():
     runner = CliRunner()
+    test_pdf_path = "test.pdf"
+
+    if os.path.exists(test_pdf_path):
+        os.remove(test_pdf_path)
+
     result = runner.invoke(adastop, ['reset', 'examples'])
     assert result.exit_code == 0
     for j in range(1,6):
-        
-        result = runner.invoke(adastop, ['compare', 'examples/walker'+str(j)+'.csv'])
+        result = runner.invoke(adastop, ['compare', "--seed", "1",  'examples/walker'+str(j)+'.csv'])
         assert result.exit_code == 0
 
-    result = runner.invoke(adastop, ['compare', 'examples/walker3.csv'])
+    result = runner.invoke(adastop, ['compare',"--seed", "1",  'examples/walker3.csv'])
     assert result.exit_code == 1
+    assert result.exception.args[0] == 'Error: you tried to use more group than necessary. Use adastop status to see current status for more info.'
 
-    result = runner.invoke(adastop, ['plot', 'examples', "test.pdf"])
+
+    result = runner.invoke(adastop, ['plot', 'examples', test_pdf_path])
     assert result.exit_code == 0
     result = runner.invoke(adastop, ['status', 'examples'])
     assert result.exit_code == 0
+    assert os.path.exists(test_pdf_path) == True
+
 
     result = runner.invoke(adastop, ['reset', 'examples'])
     assert result.exit_code == 0
         
-    result = runner.invoke(adastop, ['compare', "--compare-to-first", 'examples/walker1.csv'])
+    result = runner.invoke(adastop, ['compare', "--compare-to-first","--seed", "1",  'examples/walker1.csv'])
     assert result.exit_code == 0
+
+
+
+def test_plot_no_comparator_save_file():
+    runner = CliRunner()
+    runner.invoke(adastop, ['reset', 'examples'])
+
+    result = runner.invoke(adastop, ['plot', 'examples', "test.pdf"])
+    assert result.exit_code == 1
+    assert result.exception.args[0] == 'Comparator save file not found.'
+
+def test_status_no_comparator_save_file():
+    runner = CliRunner()
+    runner.invoke(adastop, ['reset', 'examples'])
+
+    result = runner.invoke(adastop, ['status', 'examples'])
+    assert result.exit_code == 1
+    assert result.exception.args[0] == 'Comparator save file not found.'
diff --git a/tests/test_compare_agents.py b/tests/test_compare_agents.py
@@ -5,16 +5,27 @@
 B = 5000
 alpha = 0.05
 n_runs = 10
+seed = 42
 
-def test_runtime():
+def test_partial_compare():
+    rng = np.random.RandomState(seed)
     idxs = []
     comparator = MultipleAgentsComparator(n=3, K=3, B=B,  alpha=alpha, seed=42, beta = 0.01)
-    evals = {"Agent "+str(k):np.random.normal(size=3) for k in range(3)}
+    evals = {"Agent "+str(k): rng.normal(size=3) for k in range(3)}
     comparator.partial_compare(evals)
+
+
+def test_partial_compare_not_enough_points():
+    comparator = MultipleAgentsComparator(n=3, K=3, B=5000,  alpha=-1e-5, seed=42, beta = 0.01)
+    evals = {"Agent 1":np.array([0,0,0]),"Agent 2":np.array([0,0,0]),"Agent 3":np.array([0,0,0])}
+    comparator.partial_compare(evals)
+
     
 
 @pytest.mark.parametrize("K,n", [(10,2),(5,3), (3, 5), (1, 15)])
 def test_type1(K,n):
+    rng = np.random.RandomState(seed)
+
     idxs = []
     n_agents = 3
     for M in range(n_runs):
@@ -23,16 +34,18 @@ def test_type1(K,n):
         while not comparator.is_finished:
             if len(evals) >0:
                 for k in range(n_agents):
-                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=n)])
+                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] , rng.normal(size=n)])
             else:
-                evals = {"Agent "+str(k): np.random.normal(size=n) for k in range(n_agents)}
+                evals = {"Agent "+str(k): rng.normal(size=n) for k in range(n_agents)}
             comparator.partial_compare(evals)
         idxs.append(not("equal" in comparator.decisions.values()))
         print(comparator.get_results())
     assert np.mean(idxs) < 2*alpha + 1/4/(np.sqrt(n_runs)), "type 1 error seems to be too large."
         
 @pytest.mark.parametrize("K,n", [(5,3), (3, 5), (1, 15)])
 def test_type1_large_beta(K,n):
+    rng = np.random.RandomState(seed)
+
     idxs = []
     n_agents = 3
     for M in range(n_runs):
@@ -41,16 +54,18 @@ def test_type1_large_beta(K,n):
         while not comparator.is_finished:
             if len(evals) >0:
                 for k in range(n_agents):
-                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=n)])
+                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] , rng.normal(size=n)])
             else:
-                evals = {"Agent "+str(k): np.random.normal(size=n) for k in range(n_agents)}
+                evals = {"Agent "+str(k): rng.normal(size=n) for k in range(n_agents)}
             comparator.partial_compare(evals)
         idxs.append(not("equal" in comparator.decisions.values()))
         print(comparator.get_results())
     assert np.mean(idxs) < 2*alpha + 1/4/(np.sqrt(n_runs)), "type 1 error seems to be too large."
         
 @pytest.mark.parametrize("K,n", [(3, 5), (1, 15)])
 def test_type2(K,n):
+    rng = np.random.RandomState(seed)
+
     idxs = []
     n_agents = 2
     for M in range(n_runs):
@@ -59,9 +74,9 @@ def test_type2(K,n):
         while not comparator.is_finished:
             if len(evals) >0:
                 for k in range(n_agents):
-                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=n)+2*k])
+                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] , rng.normal(size=n)+2*k])
             else:
-                evals = {"Agent "+str(k): np.random.normal(size=n)+2*k for k in range(n_agents)}
+                evals = {"Agent "+str(k): rng.normal(size=n)+2*k for k in range(n_agents)}
             comparator.partial_compare(evals)
         idxs.append(not("equal" in comparator.decisions.values()))
     assert np.mean(idxs) > 0.3, "type 2 error seems to be too large."
diff --git a/tests/test_plot.py b/tests/test_plot.py
@@ -76,3 +76,39 @@ def test_plot_sota_noteq():
     # plt.savefig('fig2.pdf')
     fig, axes= plt.subplots(1,2)
     comparator.plot_results_sota(axes=axes)
+
+
+
+def test_plot_noteq2():
+    n_agents = 3
+    comparator = MultipleAgentsComparator(n=10, K=K, B=B,  alpha=alpha, seed=42, beta = 0.01)
+    evals = {}
+    while not comparator.is_finished:
+        if len(evals) >0:
+            for k in range(n_agents):
+                evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] , np.abs(2*K-k)+np.random.normal(size=10)])
+        else:
+            evals = {"Agent "+str(k): np.random.normal(size=10)+np.abs(2*K-k) for k in range(n_agents)}
+        comparator.partial_compare(evals)
+    # plt.savefig('fig2.pdf')
+    fig, axes= plt.subplots(1,2)
+    comparator.plot_results(axes=axes)
+
+def test_plot_sota_noteq2():
+    n_agents = 3
+    comparisons = np.array([(0,i) for i in [1,2]])
+    comparator = MultipleAgentsComparator(n=10, K=K, B=B,  alpha=alpha, 
+                                          comparisons=comparisons, seed=42, beta = 0.01)
+    evals = {}
+    while not comparator.is_finished:
+        if len(evals) >0:
+            for k in range(n_agents):
+                evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=10)+np.abs(2*K-k)])
+        else:
+            evals = {"Agent "+str(k): np.random.normal(size=10)+np.abs(2*K-k) for k in range(n_agents)}
+        comparator.partial_compare(evals)
+    comparator.plot_results_sota()
+    # plt.savefig('fig2.pdf')
+    fig, axes= plt.subplots(1,2)
+    comparator.plot_results_sota(axes=axes)
+