From a9b7c2d9d420ad0e22820c9d6f05a8d80d23491f Mon Sep 17 00:00:00 2001 From: Simar Malhotra Date: Wed, 17 Jun 2026 17:45:16 -0400 Subject: [PATCH 1/8] build(deps): add nokolexbor and rspec, pin ruby 4.0.1 Signed-off-by: Simar Malhotra --- .ruby-version | 1 + Gemfile | 4 ++++ Gemfile.lock | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) create mode 100644 .ruby-version create mode 100644 Gemfile create mode 100644 Gemfile.lock diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 00000000..1454f6ed --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +4.0.1 diff --git a/Gemfile b/Gemfile new file mode 100644 index 00000000..b6d054a5 --- /dev/null +++ b/Gemfile @@ -0,0 +1,4 @@ +source "https://rubygems.org" + +gem "nokolexbor" +gem "rspec" diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 00000000..64a9676a --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,49 @@ +GEM + remote: https://rubygems.org/ + specs: + diff-lcs (1.6.2) + nokolexbor (0.7.0) + nokolexbor (0.7.0-aarch64-linux) + nokolexbor (0.7.0-arm64-darwin) + nokolexbor (0.7.0-x86_64-darwin) + nokolexbor (0.7.0-x86_64-linux) + rspec (3.13.2) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.6) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.5) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-mocks (3.13.8) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-support (3.13.7) + +PLATFORMS + aarch64-linux + arm64-darwin + ruby + x86_64-darwin + x86_64-linux + +DEPENDENCIES + nokolexbor + rspec + +CHECKSUMS + diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962 + nokolexbor (0.7.0) sha256=a6df669d9280bfe7f5f334a1734c96b80d8d54ff9b18cea807dbe5651be45dd7 + nokolexbor (0.7.0-aarch64-linux) sha256=1729e1d5e5fb3a5f1328453f4ee884a8c53de3a94ff315cacf518acf8b4e059f + nokolexbor (0.7.0-arm64-darwin) 
sha256=874c1cae2c2658d0cc4018f6569540753ff03b79bacb1b0d1380a8230a0a14ea + nokolexbor (0.7.0-x86_64-darwin) sha256=5de1b440996839cf82f2f35c79b4e1eee28100a263cdb9e67fa28c016c0526fe + nokolexbor (0.7.0-x86_64-linux) sha256=6348178e41233e67e0f533f84b0b1974b187fe137616541f1453bb7c0c16baf6 + rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587 + rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d + rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836 + rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47 + rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c + +BUNDLED WITH + 4.0.3 From dc629d65bb50d257f82d43386d71b902c4301369 Mon Sep 17 00:00:00 2001 From: Simar Malhotra Date: Wed, 17 Jun 2026 17:47:34 -0400 Subject: [PATCH 2/8] feat(parser): implement structure-based carousel extractor and image resolver Signed-off-by: Simar Malhotra --- lib/carousel_parser.rb | 70 ++++++++++++++++++++++++++++++++++++++++++ lib/image_extractor.rb | 55 +++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 lib/carousel_parser.rb create mode 100644 lib/image_extractor.rb diff --git a/lib/carousel_parser.rb b/lib/carousel_parser.rb new file mode 100644 index 00000000..adbcd9df --- /dev/null +++ b/lib/carousel_parser.rb @@ -0,0 +1,70 @@ +require "nokolexbor" +require_relative "image_extractor" + +# Parses a Google Search HTML page and extracts carousel items as an array of +# { name:, extensions:, link:, image: } hashes. +# +# Detection is structure-based rather than CSS-class-based: a carousel item is +# any whose href contains both "/search?" and "&stick=". This signal is +# part of Google's search URL semantics and is stable across class-name rotations +# and different carousel layouts (paintings, movies, albums, etc.). 
class CarouselParser
  # Origin prepended to root-relative links so every emitted link is absolute.
  GOOGLE_BASE = "https://www.google.com".freeze
  # A label is reported as an extension only when it is exactly four digits.
  YEAR_PATTERN = /\A\d{4}\z/
  # Entity text that is rewritten to a bare ampersand in emitted links.
  AMP_ENTITY = "&amp;".freeze

  # @param html [String] raw markup of a search results page
  def initialize(html)
    @doc = Nokolexbor::HTML(html)
    @extractor = ImageExtractor.new(@doc)
  end

  # @return [Array<Hash>] one hash per carousel anchor, in document order,
  #   with string keys "name", "extensions", "link" and, when resolvable,
  #   "image"
  def parse
    carousel_anchors.map { |anchor| extract_item(anchor) }
  end

  private

  # Anchors whose href contains both "/search?" and "stick=" and that wrap at
  # least one <img>.
  def carousel_anchors
    @doc.css("a").select do |a|
      href = a["href"].to_s
      href.include?("/search?") && href.include?("stick=") && a.css("img").any?
    end
  end

  # Builds the result hash for one anchor. The first leaf text is the name and
  # the second is the extension candidate. The "image" key is added only when
  # the extractor returns a value; "extensions" is always present (possibly nil).
  def extract_item(anchor)
    labels = leaf_texts(anchor)
    img = anchor.css("img").first

    item = {
      "name" => labels[0],
      "extensions" => year_extensions(labels[1]),
      "link" => normalize_link(anchor["href"])
    }
    image = img ? @extractor.for_img(img) : nil
    item["image"] = image unless image.nil?
    item
  end

  # Collects stripped, non-empty text from text nodes whose parent has no
  # element children, so text is taken once from the innermost container
  # instead of being repeated for every wrapping element.
  #
  # @return [Array<String>] texts in traversal order
  def leaf_texts(node)
    texts = []
    node.traverse do |child|
      next unless child.text?
      next if child.parent.element_children.any?

      text = child.text.strip
      texts << text unless text.empty?
    end
    texts
  end

  # @param text [String, nil] second label of a carousel item
  # @return [Array<String>, nil] single-element array when +text+ is a
  #   four-digit year, otherwise nil
  def year_extensions(text)
    return nil unless text && text.match?(YEAR_PATTERN)

    [text]
  end

  # Replaces "&amp;" with "&" (the previous pattern replaced "&" with itself
  # and therefore did nothing) and makes root-relative links absolute.
  #
  # @param href [String] href attribute of a carousel anchor
  # @return [String] absolute link
  def normalize_link(href)
    decoded = href.gsub(AMP_ENTITY, "&")
    decoded.start_with?("/") ? "#{GOOGLE_BASE}#{decoded}" : decoded
  end
end

Search Results

Robert De Niro
American actor and director
Google apps
Google Account
Test User
user@example.com
\ No newline at end of file diff --git a/spec/fixtures/shinkai-books.html b/spec/fixtures/shinkai-books.html new file mode 100644 index 00000000..71139da1 --- /dev/null +++ b/spec/fixtures/shinkai-books.html @@ -0,0 +1,41 @@ +makoto shinkai books - Google Search

Search Results

Makoto Shinkai
Japanese filmmaker and novelist
Google apps
Google Account
Test User
user@example.com
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line entry point: reads the HTML file named by the first argument,
# runs CarouselParser over it and prints the result as pretty-printed JSON on
# stdout.
#
# Exit status: 0 on success or when help is requested explicitly; 1 when the
# argument is missing or does not name a regular file.

require "bundler/setup"
require "json"
require_relative "../lib/carousel_parser"

usage = <<~TEXT
  Usage: bin/parse <path-to-html>
         bin/parse files/van-gogh-paintings.html
TEXT

path = ARGV[0]

# An explicit help request is not an error: print to stdout and succeed.
if %w[--help -h].include?(path)
  puts usage
  exit 0
end

# A missing argument is a usage error: print to stderr and fail.
if path.nil?
  warn usage
  exit 1
end

# File.file? rather than File.exist?: a directory exists but cannot be read
# with File.read, which would surface as an unhandled exception below.
unless File.file?(path)
  warn "Error: file not found: #{path}"
  exit 1
end

results = CarouselParser.new(File.read(path)).parse
puts JSON.pretty_generate(results)
#!/usr/bin/env bash
# Development bootstrap: checks that the `ruby` on PATH meets the minimum
# version, then runs `bundle install` and prints the commands to try next.
# Exits 1 (before installing anything) when Ruby is missing or too old.
set -e

# Minimum supported Ruby as major.minor. Both parts are compared below so the
# guard agrees with the ">= major.minor" wording of the error message; the
# previous check compared only the major version and let 3.0.x through.
required_major=3
required_minor=1

# First x.y.z triple printed by `ruby --version`; empty when ruby is not found.
ruby_version=$(ruby --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1)
ruby_major=$(echo "$ruby_version" | cut -d. -f1)
ruby_minor=$(echo "$ruby_version" | cut -d. -f2)

# The -z test comes first so the numeric comparisons never see an empty string.
if [ -z "$ruby_version" ] \
  || [ "$ruby_major" -lt "$required_major" ] \
  || { [ "$ruby_major" -eq "$required_major" ] && [ "$ruby_minor" -lt "$required_minor" ]; }; then
  # Diagnostics go to stderr so they are not mistaken for normal output.
  echo "Error: Ruby >= $required_major.$required_minor required (found: ${ruby_version:-none})" >&2
  echo "Install via Homebrew: brew install ruby" >&2
  echo "Then add to your shell: export PATH=\"/opt/homebrew/opt/ruby/bin:\$PATH\"" >&2
  exit 1
fi

echo "Ruby $ruby_version OK"
bundle install
echo "Setup complete. Run:"
echo "  bundle exec bin/parse files/van-gogh-paintings.html"
echo "  bundle exec rspec"