diff --git a/lib/benchmark_runner.rb b/lib/benchmark_runner.rb index c50bedea..5187486b 100644 --- a/lib/benchmark_runner.rb +++ b/lib/benchmark_runner.rb @@ -48,7 +48,7 @@ def write_csv(output_path, ruby_descriptions, table) end # Build output text string with metadata, table, and legend - def build_output_text(ruby_descriptions, table, format, bench_failures, include_rss: false, include_gc: false) + def build_output_text(ruby_descriptions, table, format, bench_failures, include_rss: false, include_gc: false, include_pvalue: false) base_name, *other_names = ruby_descriptions.keys output_str = +"" @@ -73,7 +73,9 @@ def build_output_text(ruby_descriptions, table, format, bench_failures, include_ output_str << "- sweep #{base_name}/#{name}: ratio of GC sweeping time. Higher is better for #{name}. Above 1 represents faster sweeping.\n" end end - output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n" + if include_pvalue + output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n" + end end output_str diff --git a/lib/benchmark_runner/cli.rb b/lib/benchmark_runner/cli.rb index 44127776..46eb65c3 100644 --- a/lib/benchmark_runner/cli.rb +++ b/lib/benchmark_runner/cli.rb @@ -109,7 +109,7 @@ def run BenchmarkRunner.write_csv(output_path, ruby_descriptions, table) # Save the output in a text file that we can easily refer to - output_str = BenchmarkRunner.build_output_text(ruby_descriptions, table, format, bench_failures, include_rss: args.rss, include_gc: builder.include_gc?) + output_str = BenchmarkRunner.build_output_text(ruby_descriptions, table, format, bench_failures, include_rss: args.rss, include_gc: builder.include_gc?, include_pvalue: args.pvalue) out_txt_path = output_path + ".txt" File.open(out_txt_path, "w") { |f| f.write output_str } diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb index a08c7494..75f3ba9c 100644 --- a/lib/results_table_builder.rb +++ b/lib/results_table_builder.rb @@ -174,7 +174,7 @@ def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts) row.concat(ratio_1sts) other_ts.each do |other_t| - pval = Stats.welch_p_value(base_t, other_t) + pval = @include_pvalue ? Stats.welch_p_value(base_t, other_t) : nil row << format_ratio(mean(base_t) / mean(other_t), pval) if @include_pvalue row << format_p_value(pval) @@ -207,7 +207,7 @@ def gc_ratio(base, other) mean(other) == 0.0 return "N/A" end - pval = Stats.welch_p_value(base, other) + pval = @include_pvalue ? Stats.welch_p_value(base, other) : nil format_ratio(mean(base) / mean(other), pval) end diff --git a/test/benchmark_runner_test.rb b/test/benchmark_runner_test.rb index a8eb812f..e475ed7f 100644 --- a/test/benchmark_runner_test.rb +++ b/test/benchmark_runner_test.rb @@ -387,6 +387,25 @@ assert_includes result, 'Legend:' assert_includes result, '- ruby-yjit 1st itr: ratio of ruby-base/ruby-yjit time for the first benchmarking iteration.' assert_includes result, '- ruby-base/ruby-yjit: ratio of ruby-base/ruby-yjit time. Higher is better for ruby-yjit. Above 1 represents a speedup.' + refute_includes result, "p < 0.001" + end + + it 'includes p-value legend when include_pvalue is true' do + ruby_descriptions = { + 'ruby-base' => 'ruby 3.3.0', + 'ruby-yjit' => 'ruby 3.3.0 +YJIT' + } + table = [ + ['bench', 'ruby-base (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)'], + ['fib', '100.0', '5.0', '50.0', '3.0'] + ] + format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f'] + bench_failures = {} + + result = BenchmarkRunner.build_output_text( + ruby_descriptions, table, format, bench_failures, include_pvalue: true + ) + assert_includes result, "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)" end diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb index 153c01a7..a79d5288 100644 --- a/test/results_table_builder_test.rb +++ b/test/results_table_builder_test.rb @@ -425,7 +425,7 @@ assert_equal '', table[1].last end - it 'always shows significance symbol but omits verbose columns without --pvalue' do + it 'omits significance symbols and p-value columns without --pvalue' do executable_names = ['ruby', 'ruby-yjit'] bench_data = { 'ruby' => { @@ -452,7 +452,41 @@ table, _format = builder.build refute_includes table[0], 'p-value' refute_includes table[0], 'sig' - assert_match(/\(\*{1,3}\)$/, table[1].last) + ratio_cell = table[1].last + refute_match(/\*/, ratio_cell) + assert_match(/\A\d+\.\d+\s*\z/, ratio_cell) + end + + it 'shows significance symbols and p-value columns with --pvalue' do + executable_names = ['ruby', 'ruby-yjit'] + bench_data = { + 'ruby' => { + 'fib' => { + 'warmup' => [0.1], + 'bench' => [0.100, 0.101, 0.099], + 'rss' => 1024 * 1024 * 10 + } + }, + 'ruby-yjit' => { + 'fib' => { + 'warmup' => [0.05], + 'bench' => [0.050, 0.051, 0.049], + 'rss' => 1024 * 1024 * 12 + } + } + } + + builder = ResultsTableBuilder.new( + executable_names: executable_names, + bench_data: bench_data, + include_pvalue: true + ) + + table, _format = builder.build + assert_includes table[0], 'p-value' + assert_includes table[0], 'sig' + ratio_col_idx = table[0].index('ruby/ruby-yjit') + assert_match(/\(\*{1,3}\)/, table[1][ratio_col_idx]) end it 'handles only headline benchmarks' do