serpapi · Simar-malhotra09 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/.rspec b/.rspec
@@ -0,0 +1 @@
+--require spec_helper
diff --git a/.ruby-version b/.ruby-version
@@ -0,0 +1 @@
+4.0.1
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,4 @@
+source "https://rubygems.org"
+
+gem "nokolexbor"
+gem "rspec"
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -0,0 +1,47 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    diff-lcs (1.6.2)
+    nokolexbor (0.7.0)
+    nokolexbor (0.7.0-aarch64-linux)
+    nokolexbor (0.7.0-arm64-darwin)
+    nokolexbor (0.7.0-x86_64-darwin)
+    nokolexbor (0.7.0-x86_64-linux)
+    rspec (3.13.2)
+      rspec-core (~> 3.13.0)
+      rspec-expectations (~> 3.13.0)
+      rspec-mocks (~> 3.13.0)
+    rspec-core (3.13.6)
+      rspec-support (~> 3.13.0)
+    rspec-expectations (3.13.5)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-mocks (3.13.8)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-support (3.13.7)
+
+PLATFORMS
+  aarch64-linux
+  arm64-darwin
+  ruby
+  x86_64-darwin
+  x86_64-linux
+
+DEPENDENCIES
+  nokolexbor
+  rspec
+
+CHECKSUMS
+  diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
+  nokolexbor (0.7.0) sha256=a6df669d9280bfe7f5f334a1734c96b80d8d54ff9b18cea807dbe5651be45dd7
+  nokolexbor (0.7.0-aarch64-linux) sha256=1729e1d5e5fb3a5f1328453f4ee884a8c53de3a94ff315cacf518acf8b4e059f
+  nokolexbor (0.7.0-arm64-darwin) sha256=874c1cae2c2658d0cc4018f6569540753ff03b79bacb1b0d1380a8230a0a14ea
+  nokolexbor (0.7.0-x86_64-darwin) sha256=5de1b440996839cf82f2f35c79b4e1eee28100a263cdb9e67fa28c016c0526fe
+  nokolexbor (0.7.0-x86_64-linux) sha256=6348178e41233e67e0f533f84b0b1974b187fe137616541f1453bb7c0c16baf6
+  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
+  rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
+  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
+  rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
+  rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
+
diff --git a/bin/parse b/bin/parse
@@ -0,0 +1,22 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "bundler/setup"
+require "json"
+require_relative "../lib/carousel_parser"
+
+path = ARGV[0]
+
+if path.nil? || path == "--help" || path == "-h"
+  warn "Usage: bin/parse <path-to-html-file>"
+  warn "       bin/parse files/van-gogh-paintings.html"
+  exit 1
+end
+
+unless File.exist?(path)
+  warn "Error: file not found: #{path}"
+  exit 1
+end
+
+results = CarouselParser.new(File.read(path)).parse
+puts JSON.pretty_generate(results)
diff --git a/bin/setup b/bin/setup
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+set -e
+
+required_major=3
+ruby_version=$(ruby --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1)
+ruby_major=$(echo "$ruby_version" | cut -d. -f1)
+
+if [ -z "$ruby_version" ] || [ "$ruby_major" -lt "$required_major" ]; then
+  echo "Error: Ruby >= $required_major.1 required (found: ${ruby_version:-none})"
+  echo "Install via Homebrew: brew install ruby"
+  echo "Then add to your shell: export PATH=\"/opt/homebrew/opt/ruby/bin:\$PATH\""
+  exit 1
+fi
+
+echo "Ruby $ruby_version OK"
+bundle install
+echo "Setup complete. Run:"
+echo "  bundle exec bin/parse files/van-gogh-paintings.html"
+echo "  bundle exec rspec"
diff --git a/lib/carousel_parser.rb b/lib/carousel_parser.rb
@@ -0,0 +1,71 @@
+require "nokolexbor"
+require_relative "image_extractor"
+
+# Parses a Google Search HTML page and extracts carousel items as an array of
+# { name:, extensions:, link:, image: } hashes.
+#
+# Detection is structure-based rather than CSS-class-based: a carousel item is
+# any <a> that wraps an <img> and whose href contains both "/search?" and a
+# "stick=" parameter. This signal is part of Google's search URL semantics and
+# is stable across class-name rotations and different carousel layouts
+# (paintings, movies, albums, etc.).
+class CarouselParser
+  GOOGLE_BASE = "https://www.google.com"
+  YEAR_PATTERN = /\A\d{4}\z/
+
+  def initialize(html)
+    @doc       = Nokolexbor::HTML(html)
+    @extractor = ImageExtractor.new(@doc)
+  end
+
+  def parse
+    carousel_anchors.map { |anchor| extract_item(anchor) }
+  end
+
+  private
+
+  def carousel_anchors
+    @doc.css("a").select do |a|
+      href = a["href"].to_s
+      href.include?("/search?") && href.include?("stick=") && a.css("img").any?
+    end
+  end
+
+  def extract_item(anchor)
+    labels = leaf_texts(anchor)
+    img    = anchor.css("img").first
+
+    {
+      "name"       => labels[0],
+      "extensions" => year_extensions(labels[1]),
+      "link"       => normalize_link(anchor["href"]),
+      "image"      => img ? @extractor.for_img(img) : nil
+    }.tap { |h| h.delete("extensions") if h["extensions"].nil? }
+     .tap { |h| h.delete("image")      if h["image"].nil? }
+  end
+
+  # Collects text from leaf nodes (nodes with no element children) under the
+  # anchor. This avoids duplicating text that appears in nested containers and
+  # reliably separates name from extension without relying on class names.
+  def leaf_texts(node)
+    texts = []
+    node.traverse do |child|
+      next unless child.text?
+      next if child.parent.element_children.any?
+
+      text = child.text.strip
+      texts << text unless text.empty?
+    end
+    texts
+  end
+
+  def year_extensions(text)
+    return nil unless text && text.match?(YEAR_PATTERN)
+
+    [text]
+  end
+
+  def normalize_link(href)
+    decoded = href.gsub("&amp;", "&")
+    decoded.start_with?("/") ? "#{GOOGLE_BASE}#{decoded}" : decoded
+  end
+end
diff --git a/lib/image_extractor.rb b/lib/image_extractor.rb
@@ -0,0 +1,55 @@
+require "nokolexbor"
+
+# Resolves carousel thumbnail images from two sources present in the static HTML:
+#   1. Inline <script> blocks that call _setImagesSrc(ii, s) — used for the
+#      initially-visible items. Parses id → base64 data URI mappings once on init.
+#   2. data-src attributes on <img> tags — used for lazy-loaded items.
+class ImageExtractor
+  def initialize(doc)
+    @id_map = build_id_map(doc)
+  end
+
+  def for_img(img_node)
+    id = img_node["id"]
+    return @id_map[id] if id && @id_map.key?(id)
+
+    img_node["data-src"]
+  end
+
+  private
+
+  def build_id_map(doc)
+    map = {}
+    doc.css("script").each do |script|
+      src = script.text
+      next unless src.include?("_setImagesSrc") && src.include?("data:image")
+
+      image_uri = extract_image_uri(src)
+      ids       = extract_ids(src)
+
+      next unless image_uri && ids.any?
+
+      ids.each { |id| map[id] = image_uri }
+    end
+    map
+  end
+
+  def extract_image_uri(src)
+    match = src.match(/(?:var|let|const)\s+s\s*=\s*'(data:image[^']*)'/)
+    return nil unless match
+
+    unescape_js(match[1])
+  end
+
+  def extract_ids(src)
+    match = src.match(/(?:var|let|const)\s+ii\s*=\s*\[([^\]]+)\]/)
+    return [] unless match
+
+    match[1].scan(/'([^']+)'/).flatten
+  end
+
+  # Google encodes some base64 padding chars as \x3d (=) and slashes as \/
+  def unescape_js(str)
+    str.gsub(/\\x([0-9a-fA-F]{2})/) { $1.to_i(16).chr }.gsub('\/', "/")
+  end
+end
diff --git a/spec/carousel_parser_spec.rb b/spec/carousel_parser_spec.rb
@@ -0,0 +1,108 @@
+require "json"
+require_relative "../lib/carousel_parser"
+
+FIXTURES = File.expand_path("../files", __dir__)
+
+RSpec.describe CarouselParser do
+  describe "Van Gogh paintings (primary fixture)" do
+    let(:html)     { File.read("#{FIXTURES}/van-gogh-paintings.html") }
+    let(:expected) { JSON.parse(File.read("#{FIXTURES}/expected-array.json"))["artworks"] }
+    let(:results)  { described_class.new(html).parse }
+
+    it "extracts the correct number of artworks" do
+      expect(results.length).to eq(expected.length)
+    end
+
+    it "matches names in order" do
+      expect(results.map { |r| r["name"] }).to eq(expected.map { |e| e["name"] })
+    end
+
+    it "matches extensions (years) in order" do
+      expect(results.map { |r| r["extensions"] }).to eq(expected.map { |e| e["extensions"] })
+    end
+
+    it "matches links in order" do
+      expect(results.map { |r| r["link"] }).to eq(expected.map { |e| e["link"] })
+    end
+
+    it "matches images in order" do
+      expect(results.map { |r| r["image"] }).to eq(expected.map { |e| e["image"] })
+    end
+
+    it "produces an exact match against the full expected array" do
+      expect(results).to eq(expected)
+    end
+  end
+
+  describe "De Niro movies carousel" do
+    let(:results) { described_class.new(File.read("#{FIXTURES}/../spec/fixtures/deniro-movies.html")).parse }
+
+    it "extracts 12 movies" do
+      expect(results.length).to eq(12)
+    end
+
+    it "first item is Taxi Driver (1976)" do
+      expect(results.first["name"]).to eq("Taxi Driver")
+      expect(results.first["extensions"]).to eq(["1976"])
+    end
+
+    it "every item has a name, link, and image" do
+      results.each do |r|
+        expect(r["name"]).to be_a(String)
+        expect(r["link"]).to start_with("https://www.google.com")
+        expect(r["image"]).not_to be_nil
+      end
+    end
+  end
+
+  describe "Shinkai books carousel" do
+    let(:results) { described_class.new(File.read("#{FIXTURES}/../spec/fixtures/shinkai-books.html")).parse }
+
+    it "extracts 12 books" do
+      expect(results.length).to eq(12)
+    end
+
+    it "first item is Your Name (2016)" do
+      expect(results.first["name"]).to eq("Your Name")
+      expect(results.first["extensions"]).to eq(["2016"])
+    end
+
+    it "every item has a name, link, and image" do
+      results.each do |r|
+        expect(r["name"]).to be_a(String)
+        expect(r["link"]).to start_with("https://www.google.com")
+        expect(r["image"]).not_to be_nil
+      end
+    end
+  end
+
+  describe "item structure" do
+    let(:html)    { File.read("#{FIXTURES}/van-gogh-paintings.html") }
+    let(:results) { described_class.new(html).parse }
+
+    it "every item has a name string" do
+      results.each { |r| expect(r["name"]).to be_a(String) }
+    end
+
+    it "every item has a link starting with https://www.google.com" do
+      results.each { |r| expect(r["link"]).to start_with("https://www.google.com") }
+    end
+
+    it "extensions, when present, contain a four-digit year string" do
+      results.each do |r|
+        next unless r["extensions"]
+
+        expect(r["extensions"]).to be_an(Array)
+        r["extensions"].each { |ext| expect(ext).to match(/\A\d{4}\z/) }
+      end
+    end
+
+    it "images, when present, are data URIs or URLs" do
+      results.each do |r|
+        next unless r["image"]
+
+        expect(r["image"]).to match(/\Adata:image\/|https:\/\//)
+      end
+    end
+  end
+end