From 3ae245e2a20b85c4f5782bb05773e7212e928e31 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 9 Jun 2026 03:55:45 +0000 Subject: [PATCH 1/5] fix(cli): improve LaTeX and JS-heavy site parsing in direct_fetch Updated strip_html in direct_fetch.rs to: - Skip content in , , and alt attributes (starting with {\displaystyle) in $ delimiters. - Added unit tests to verify these improvements. - Fixed a bug in integration tests where assertions were incorrectly wrapped in tuples. - Formatted code with cargo fmt and ruff. Co-authored-by: d-oit <6849456+d-oit@users.noreply.github.com> --- cli/src/providers/direct_fetch.rs | 32 +++++++++++++++++++++++++++++-- tests/test_routing_foundation.py | 12 ++++++------ 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/cli/src/providers/direct_fetch.rs b/cli/src/providers/direct_fetch.rs index 890f18d..2e76011 100644 --- a/cli/src/providers/direct_fetch.rs +++ b/cli/src/providers/direct_fetch.rs @@ -207,7 +207,12 @@ fn strip_html(html: &str) -> String { .next() .unwrap_or(""); - if tag_name == "script" || tag_name == "style" { + if tag_name == "script" + || tag_name == "style" + || tag_name == "math" + || tag_name == "svg" + || tag_name == "noscript" + { if is_closing { skip_content_depth = skip_content_depth.saturating_sub(1); } else if !tag_lower.trim().ends_with('/') { @@ -251,7 +256,13 @@ fn strip_html(html: &str) -> String { if let Some(alt) = get_attribute(¤t_tag, "alt") { if !alt.is_empty() { result.push(' '); - result.push_str(&alt); + if alt.trim().starts_with("{\\displaystyle") { + result.push('$'); + result.push_str(alt.trim()); + result.push('$'); + } else { + result.push_str(&alt); + } result.push(' '); } } @@ -357,6 +368,23 @@ mod tests { assert!(result.contains("x^2 + y^2 = z^2")); } + #[test] + fn test_img_alt_latex() { + let html = "\"{\\displaystyle"; + let result = strip_html(html); + assert!(result.contains("${\\displaystyle x^2 
+ y^2 = z^2}$")); + } + + #[test] + fn test_skip_math_svg() { + let html = "
Keep this x
"; + let result = strip_html(html); + assert!(result.contains("Keep this")); + assert!(!result.contains("")); + assert!(!result.contains("x")); + assert!(!result.contains(" 0.7, f"Expected avg_quality > 0.7, got {stats['avg_quality']}" From 26b9f705e9ea43d112de8b755890f8086ff541e8 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 9 Jun 2026 04:03:51 +0000 Subject: [PATCH 2/5] fix(cli): improve LaTeX parsing and fix integration test assertions This commit improves the HTML-to-Markdown conversion in the Rust CLI and fixes a critical bug in the integration test suite. Changes: - Updated `strip_html` in `direct_fetch.rs` to skip noise from ``, ``, and `
 block but haven't found a language hint yet, check the  tag
-                            if let Some(lang) = get_attribute(¤t_tag, "class")
-                                .and_then(|c| parse_language_hint(&c))
-                            {
-                                // Backtrack to inject the language hint if the current block is empty of content
-                                if result.ends_with("\n```\n") {
-                                    result.truncate(result.len() - 1); // Remove trailing newline
-                                    result.push_str(&lang);
-                                    result.push('\n');
-                                    current_pre_lang = lang;
-                                }
-                            }
-                        }
-                    } else if tag_name == "pre" {
-                        in_pre = true;
-                        current_pre_lang = get_attribute(¤t_tag, "class")
-                            .and_then(|c| parse_language_hint(&c))
-                            .unwrap_or_default();
-                        result.push_str("\n```");
-                        result.push_str(¤t_pre_lang);
-                        result.push('\n');
-                    } else if tag_name == "img" {
-                        if let Some(alt) = get_attribute(¤t_tag, "alt") {
-                            if !alt.is_empty() {
-                                result.push(' ');
-                                if alt.trim().starts_with("{\\displaystyle") {
-                                    result.push('$');
-                                    result.push_str(alt.trim());
-                                    result.push('$');
-                                } else {
-                                    result.push_str(&alt);
-                                }
-                                result.push(' ');
-                            }
-                        }
-                    }
-                } else {
-                    // Closing tags
-                    if tag_name == "code" {
-                        if !in_pre {
-                            result.push('`');
-                        }
-                    } else if tag_name == "pre" {
-                        in_pre = false;
-                        if !result.ends_with('\n') {
-                            result.push('\n');
-                        }
-                        result.push_str("```\n");
-                    } else if block_tags.contains(tag_name)
-                        && !result.is_empty()
-                        && !result.ends_with('\n')
-                    {
-                        result.push('\n');
-                    }
-                }
-            }
+            state.handle_tag(¤t_tag);
         } else if in_tag {
             current_tag.push(ch);
-        } else if skip_content_depth == 0 {
-            result.push(ch);
+        } else if state.skip_content_depth == 0 {
+            state.result.push(ch);
         }
     }
 
-    let decoded = decode_entities(&result);
+    let decoded = decode_entities(&state.result);
 
     // Clean up whitespace
     let mut final_result = String::new();

From 868a9d7cf94de200a9eb8a8beb4ab1723da12ee4 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Tue, 9 Jun 2026 08:21:14 +0000
Subject: [PATCH 4/5] fix(cli): elide needless lifetimes in StripperState impl

- Addresses Clippy error `clippy::needless_lifetimes` by eliding the
  explicit `'a` lifetime in the `impl` block for `StripperState`.
- This ensures CI checks pass for the Rust CLI.
- Main loop and parsing logic remain unchanged and are verified by tests.

Co-authored-by: d-oit <6849456+d-oit@users.noreply.github.com>
---
 cli/src/providers/direct_fetch.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cli/src/providers/direct_fetch.rs b/cli/src/providers/direct_fetch.rs
index 73b1c99..44d10d8 100644
--- a/cli/src/providers/direct_fetch.rs
+++ b/cli/src/providers/direct_fetch.rs
@@ -166,7 +166,7 @@ struct StripperState<'a> {
     block_tags: HashSet<&'a str>,
 }
 
-impl<'a> StripperState<'a> {
+impl StripperState<'_> {
     fn new() -> Self {
         let block_tags = [
             "p",

From 7d7e0986c3fb3753f7c9d9292a4fe1598c443ae8 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Tue, 9 Jun 2026 08:29:29 +0000
Subject: [PATCH 5/5] fix(cli): resolve CI lint and formatting issues

- Fixed Clippy error `clippy::needless_lifetimes` in `StripperState`.
- Applied `cargo fmt` to satisfy Rust formatting requirements.
- Verified all tests and lints pass locally.
- This commit updates the existing PR branch to resolve CI failures.

Co-authored-by: d-oit <6849456+d-oit@users.noreply.github.com>
---
 cli/src/providers/direct_fetch.rs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/cli/src/providers/direct_fetch.rs b/cli/src/providers/direct_fetch.rs
index 44d10d8..c3dd640 100644
--- a/cli/src/providers/direct_fetch.rs
+++ b/cli/src/providers/direct_fetch.rs
@@ -213,10 +213,7 @@ impl StripperState<'_> {
             .next()
             .unwrap_or("");
 
-        if matches!(
-            tag_name,
-            "script" | "style" | "math" | "svg" | "noscript"
-        ) {
+        if matches!(tag_name, "script" | "style" | "math" | "svg" | "noscript") {
             if is_closing {
                 self.skip_content_depth = self.skip_content_depth.saturating_sub(1);
             } else if !tag_lower.trim().ends_with('/') {