Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--require spec_helper
1 change: 1 addition & 0 deletions .ruby-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
4.0.1
4 changes: 4 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Gems are resolved from the public RubyGems index.
source "https://rubygems.org"

# HTML parsing library required by lib/carousel_parser.rb and lib/image_extractor.rb.
gem "nokolexbor"
# Test framework; spec/carousel_parser_spec.rb is written against RSpec.describe.
gem "rspec"
47 changes: 47 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
GEM
remote: https://rubygems.org/
specs:
diff-lcs (1.6.2)
nokolexbor (0.7.0)
nokolexbor (0.7.0-aarch64-linux)
nokolexbor (0.7.0-arm64-darwin)
nokolexbor (0.7.0-x86_64-darwin)
nokolexbor (0.7.0-x86_64-linux)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.6)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.8)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.7)

PLATFORMS
aarch64-linux
arm64-darwin
ruby
x86_64-darwin
x86_64-linux

DEPENDENCIES
nokolexbor
rspec

CHECKSUMS
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
nokolexbor (0.7.0) sha256=a6df669d9280bfe7f5f334a1734c96b80d8d54ff9b18cea807dbe5651be45dd7
nokolexbor (0.7.0-aarch64-linux) sha256=1729e1d5e5fb3a5f1328453f4ee884a8c53de3a94ff315cacf518acf8b4e059f
nokolexbor (0.7.0-arm64-darwin) sha256=874c1cae2c2658d0cc4018f6569540753ff03b79bacb1b0d1380a8230a0a14ea
nokolexbor (0.7.0-x86_64-darwin) sha256=5de1b440996839cf82f2f35c79b4e1eee28100a263cdb9e67fa28c016c0526fe
nokolexbor (0.7.0-x86_64-linux) sha256=6348178e41233e67e0f533f84b0b1974b187fe137616541f1453bb7c0c16baf6
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c

22 changes: 22 additions & 0 deletions bin/parse
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/setup"
require "json"
require_relative "../lib/carousel_parser"

# Command-line entry point: reads the HTML file named by the first argument,
# runs CarouselParser over its contents and prints the result as pretty JSON.
path = ARGV[0]
usage = [
  "Usage: bin/parse <path-to-html-file>",
  " bin/parse files/van-gogh-paintings.html"
]

# Help was requested explicitly: print usage on stdout and report success.
if ["--help", "-h"].include?(path)
  puts(*usage)
  exit 0
end

# A missing argument is a usage error: report on stderr with a failing status.
if path.nil?
  warn(*usage)
  exit 1
end

# File.file? (unlike File.exist?) also rejects directories, which would
# otherwise make File.read below raise Errno::EISDIR with a stack trace.
unless File.file?(path)
  warn "Error: file not found: #{path}"
  exit 1
end

results = CarouselParser.new(File.read(path)).parse
puts JSON.pretty_generate(results)
19 changes: 19 additions & 0 deletions bin/setup
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Checks that a sufficiently recent Ruby is on PATH, then installs the bundle
# and prints the commands used to run the parser and the specs.
set -e

# Minimum supported Ruby is required_major.required_minor.
required_major=3
required_minor=1

# First x.y.z triple printed by `ruby --version`; empty when ruby is missing.
ruby_version=$(ruby --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1)
ruby_major=$(echo "$ruby_version" | cut -d. -f1)
ruby_minor=$(echo "$ruby_version" | cut -d. -f2)

# Reject a missing Ruby, an older major, or the required major with an older
# minor, so that the check matches the ">= major.minor" message below.
if [ -z "$ruby_version" ] ||
   [ "$ruby_major" -lt "$required_major" ] ||
   { [ "$ruby_major" -eq "$required_major" ] && [ "$ruby_minor" -lt "$required_minor" ]; }; then
  echo "Error: Ruby >= $required_major.$required_minor required (found: ${ruby_version:-none})"
  echo "Install via Homebrew: brew install ruby"
  echo "Then add to your shell: export PATH=\"/opt/homebrew/opt/ruby/bin:\$PATH\""
  exit 1
fi

echo "Ruby $ruby_version OK"
bundle install
echo "Setup complete. Run:"
echo " bundle exec bin/parse files/van-gogh-paintings.html"
echo " bundle exec rspec"
71 changes: 71 additions & 0 deletions lib/carousel_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
require "nokolexbor"
require_relative "image_extractor"

# Parses a Google Search HTML page and extracts carousel items as an array of
# hashes with the string keys "name", "extensions", "link" and "image".
#
# Detection is structure-based rather than CSS-class-based: a carousel item is
# any <a> that contains an <img> and whose href contains both "/search?" and
# "stick=". This signal is part of Google's search URL semantics and is stable
# across class-name rotations and different carousel layouts (paintings,
# movies, albums, etc.).
class CarouselParser
  GOOGLE_BASE = "https://www.google.com".freeze
  YEAR_PATTERN = /\A\d{4}\z/
  # Keys that are dropped from an item when their value is nil.
  OPTIONAL_KEYS = %w[extensions image].freeze

  # @param html [String] raw HTML of a search results page
  def initialize(html)
    @doc = Nokolexbor::HTML(html)
    @extractor = ImageExtractor.new(@doc)
  end

  # @return [Array<Hash{String => Object}>] one hash per carousel anchor, in
  #   document order; "extensions" and "image" are omitted when unavailable
  def parse
    carousel_anchors.map { |anchor| extract_item(anchor) }
  end

  private

  # Selects the anchors that look like carousel items (see the class comment).
  def carousel_anchors
    @doc.css("a").select do |anchor|
      href = anchor["href"].to_s
      href.include?("/search?") && href.include?("stick=") && anchor.css("img").any?
    end
  end

  # Builds the result hash for one anchor. The first leaf text is the name;
  # the second one is kept as the sole extension only when it is a year.
  def extract_item(anchor)
    name, second_label = leaf_texts(anchor)
    img = anchor.css("img").first

    item = {
      "name" => name,
      "extensions" => year_extensions(second_label),
      "link" => normalize_link(anchor["href"]),
      "image" => img && @extractor.for_img(img)
    }
    OPTIONAL_KEYS.each { |key| item.delete(key) if item[key].nil? }
    item
  end

  # Collects the stripped, non-empty text of every text node whose parent has
  # no element children. This avoids duplicating text that appears in nested
  # containers and separates name from extension without relying on class
  # names.
  def leaf_texts(node)
    texts = []
    node.traverse do |child|
      next unless child.text?
      next if child.parent.element_children.any?

      text = child.text.strip
      texts << text unless text.empty?
    end
    texts
  end

  # @return [Array<String>, nil] [text] when text is exactly four digits,
  #   nil otherwise (including when text is nil)
  def year_extensions(text)
    return nil unless text&.match?(YEAR_PATTERN)

    [text]
  end

  # Turns an href into an absolute URL:
  # - literal "&amp;" sequences are replaced by "&";
  # - protocol-relative hrefs ("//host/path") get an "https:" scheme instead
  #   of being glued onto GOOGLE_BASE;
  # - other root-relative hrefs are prefixed with GOOGLE_BASE;
  # - anything else is returned as is.
  def normalize_link(href)
    decoded = href.gsub("&amp;", "&")
    return "https:#{decoded}" if decoded.start_with?("//")

    decoded.start_with?("/") ? "#{GOOGLE_BASE}#{decoded}" : decoded
  end
end
55 changes: 55 additions & 0 deletions lib/image_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
require "nokolexbor"

# Resolves carousel thumbnail images from two sources present in the static HTML:
# 1. Inline <script> blocks that call _setImagesSrc(ii, s) — used for the
#    initially-visible items. The id → data URI mappings are parsed once on
#    init; a single script may contain several such calls.
# 2. data-src attributes on <img> tags — used for lazy-loaded items.
class ImageExtractor
  # `var s = 'data:image...'` — the image payload passed to _setImagesSrc.
  IMAGE_URI_PATTERN = /(?:var|let|const)\s+s\s*=\s*'(data:image[^']*)'/
  # `var ii = ['id1', 'id2']` — the <img> ids that receive the payload.
  ID_LIST_PATTERN = /(?:var|let|const)\s+ii\s*=\s*\[([^\]]+)\]/
  # An image assignment immediately followed by its id list.
  ASSIGNMENT_PAIR_PATTERN = /#{IMAGE_URI_PATTERN}\s*;\s*#{ID_LIST_PATTERN}/

  # @param doc [#css] parsed document; every node returned by css("script")
  #   must respond to #text
  def initialize(doc)
    @id_map = build_id_map(doc)
  end

  # @param img_node [#[]] an <img> node (anything answering ["id"] / ["data-src"])
  # @return [String, nil] the inline data URI registered for the node's id, or
  #   else its data-src attribute (nil when neither is present)
  def for_img(img_node)
    id = img_node["id"]
    return @id_map[id] if id && @id_map.key?(id)

    img_node["data-src"]
  end

  private

  # Scans every script that mentions both _setImagesSrc and an inline image
  # and records which data URI belongs to which <img> id.
  def build_id_map(doc)
    doc.css("script").each_with_object({}) do |script, map|
      src = script.text
      next unless src.include?("_setImagesSrc") && src.include?("data:image")

      image_assignments(src).each do |image_uri, ids|
        ids.each { |id| map[id] = image_uri }
      end
    end
  end

  # Returns [data_uri, ids] pairs found in one script body. Every adjacent
  # `s = '...'; ii = [...]` pair is picked up, so a script holding several
  # _setImagesSrc calls contributes all of its images. When no adjacent pair
  # exists, falls back to matching the first `s` and the first `ii`
  # independently of their position.
  def image_assignments(src)
    pairs = src.scan(ASSIGNMENT_PAIR_PATTERN).map do |raw_uri, raw_ids|
      [unescape_js(raw_uri), parse_ids(raw_ids)]
    end
    return pairs unless pairs.empty?

    image_uri = extract_image_uri(src)
    image_uri ? [[image_uri, extract_ids(src)]] : []
  end

  # First inline data URI assigned to `s`, unescaped; nil when absent.
  def extract_image_uri(src)
    match = src.match(IMAGE_URI_PATTERN)
    return nil unless match

    unescape_js(match[1])
  end

  # Ids from the first `ii = [...]` list; [] when absent.
  def extract_ids(src)
    match = src.match(ID_LIST_PATTERN)
    return [] unless match

    parse_ids(match[1])
  end

  # Pulls the single-quoted entries out of the inside of a JS array literal.
  def parse_ids(list_source)
    list_source.scan(/'([^']+)'/).flatten
  end

  # Google encodes some base64 padding chars as \x3d (=) and slashes as \/
  # A JS \xHH escape denotes the code point U+00HH, so it is decoded as UTF-8;
  # this keeps the result a valid UTF-8 string even for values >= 0x80.
  def unescape_js(str)
    decoded = str.gsub(/\\x([0-9a-fA-F]{2})/) do
      Regexp.last_match(1).hex.chr(Encoding::UTF_8)
    end
    decoded.gsub('\/', "/")
  end
end
108 changes: 108 additions & 0 deletions spec/carousel_parser_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
require "json"
require_relative "../lib/carousel_parser"

# Directory holding the primary HTML fixture and its expected JSON output.
FIXTURES = File.expand_path("../files", __dir__).freeze

RSpec.describe CarouselParser do
  # Compares the parser output field by field, then as a whole, against the
  # "artworks" array of expected-array.json.
  describe "Van Gogh paintings (primary fixture)" do
    let(:html) { File.read("#{FIXTURES}/van-gogh-paintings.html") }
    let(:expected) { JSON.parse(File.read("#{FIXTURES}/expected-array.json"))["artworks"] }
    let(:results) { described_class.new(html).parse }

    it "extracts the correct number of artworks" do
      expect(results.length).to eq(expected.length)
    end

    it "matches names in order" do
      expect(results.map { |r| r["name"] }).to eq(expected.map { |e| e["name"] })
    end

    it "matches extensions (years) in order" do
      expect(results.map { |r| r["extensions"] }).to eq(expected.map { |e| e["extensions"] })
    end

    it "matches links in order" do
      expect(results.map { |r| r["link"] }).to eq(expected.map { |e| e["link"] })
    end

    it "matches images in order" do
      expect(results.map { |r| r["image"] }).to eq(expected.map { |e| e["image"] })
    end

    it "produces an exact match against the full expected array" do
      expect(results).to eq(expected)
    end
  end

  # The additional fixtures live next to this spec, in spec/fixtures.
  describe "De Niro movies carousel" do
    let(:results) { described_class.new(File.read(File.join(__dir__, "fixtures", "deniro-movies.html"))).parse }

    it "extracts 12 movies" do
      expect(results.length).to eq(12)
    end

    it "first item is Taxi Driver (1976)" do
      expect(results.first["name"]).to eq("Taxi Driver")
      expect(results.first["extensions"]).to eq(["1976"])
    end

    it "every item has a name, link, and image" do
      results.each do |r|
        expect(r["name"]).to be_a(String)
        expect(r["link"]).to start_with("https://www.google.com")
        expect(r["image"]).not_to be_nil
      end
    end
  end

  describe "Shinkai books carousel" do
    let(:results) { described_class.new(File.read(File.join(__dir__, "fixtures", "shinkai-books.html"))).parse }

    it "extracts 12 books" do
      expect(results.length).to eq(12)
    end

    it "first item is Your Name (2016)" do
      expect(results.first["name"]).to eq("Your Name")
      expect(results.first["extensions"]).to eq(["2016"])
    end

    it "every item has a name, link, and image" do
      results.each do |r|
        expect(r["name"]).to be_a(String)
        expect(r["link"]).to start_with("https://www.google.com")
        expect(r["image"]).not_to be_nil
      end
    end
  end

  # Shape checks on every item of the primary fixture.
  describe "item structure" do
    let(:html) { File.read("#{FIXTURES}/van-gogh-paintings.html") }
    let(:results) { described_class.new(html).parse }

    it "every item has a name string" do
      results.each { |r| expect(r["name"]).to be_a(String) }
    end

    it "every item has a link starting with https://www.google.com" do
      results.each { |r| expect(r["link"]).to start_with("https://www.google.com") }
    end

    it "extensions, when present, contain a four-digit year string" do
      results.each do |r|
        next unless r["extensions"]

        expect(r["extensions"]).to be_an(Array)
        r["extensions"].each { |ext| expect(ext).to match(/\A\d{4}\z/) }
      end
    end

    it "images, when present, are data URIs or URLs" do
      results.each do |r|
        next unless r["image"]

        # Both alternatives are grouped under \A so that "https://" must also
        # appear at the start of the string rather than anywhere inside it.
        expect(r["image"]).to match(%r{\A(?:data:image/|https://)})
      end
    end
  end
end
Loading