diff --git a/.rspec b/.rspec new file mode 100644 index 00000000..c99d2e73 --- /dev/null +++ b/.rspec @@ -0,0 +1 @@ +--require spec_helper diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 00000000..1454f6ed --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +4.0.1 diff --git a/Gemfile b/Gemfile new file mode 100644 index 00000000..b6d054a5 --- /dev/null +++ b/Gemfile @@ -0,0 +1,4 @@ +source "https://rubygems.org" + +gem "nokolexbor" +gem "rspec" diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 00000000..e55daac5 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,47 @@ +GEM + remote: https://rubygems.org/ + specs: + diff-lcs (1.6.2) + nokolexbor (0.7.0) + nokolexbor (0.7.0-aarch64-linux) + nokolexbor (0.7.0-arm64-darwin) + nokolexbor (0.7.0-x86_64-darwin) + nokolexbor (0.7.0-x86_64-linux) + rspec (3.13.2) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.6) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.5) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-mocks (3.13.8) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-support (3.13.7) + +PLATFORMS + aarch64-linux + arm64-darwin + ruby + x86_64-darwin + x86_64-linux + +DEPENDENCIES + nokolexbor + rspec + +CHECKSUMS + diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962 + nokolexbor (0.7.0) sha256=a6df669d9280bfe7f5f334a1734c96b80d8d54ff9b18cea807dbe5651be45dd7 + nokolexbor (0.7.0-aarch64-linux) sha256=1729e1d5e5fb3a5f1328453f4ee884a8c53de3a94ff315cacf518acf8b4e059f + nokolexbor (0.7.0-arm64-darwin) sha256=874c1cae2c2658d0cc4018f6569540753ff03b79bacb1b0d1380a8230a0a14ea + nokolexbor (0.7.0-x86_64-darwin) sha256=5de1b440996839cf82f2f35c79b4e1eee28100a263cdb9e67fa28c016c0526fe + nokolexbor (0.7.0-x86_64-linux) sha256=6348178e41233e67e0f533f84b0b1974b187fe137616541f1453bb7c0c16baf6 + rspec (3.13.2) 
sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587 + rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d + rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836 + rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47 + rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c + diff --git a/bin/parse b/bin/parse new file mode 100755 index 00000000..d83389fc --- /dev/null +++ b/bin/parse @@ -0,0 +1,22 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "bundler/setup" +require "json" +require_relative "../lib/carousel_parser" + +path = ARGV[0] + +if path.nil? || path == "--help" || path == "-h" + warn "Usage: bin/parse <path-to-html-file>" + warn " bin/parse files/van-gogh-paintings.html" + exit 1 +end + +unless File.file?(path) + warn "Error: file not found: #{path}" + exit 1 +end + +results = CarouselParser.new(File.read(path)).parse +puts JSON.pretty_generate(results) diff --git a/bin/setup b/bin/setup new file mode 100755 index 00000000..40e0f747 --- /dev/null +++ b/bin/setup @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -e + +required_major=3 +ruby_version=$(ruby --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1) +ruby_major=$(echo "$ruby_version" | cut -d. -f1) + +if [ -z "$ruby_version" ] || [ "$ruby_major" -lt "$required_major" ]; then + echo "Error: Ruby >= $required_major.1 required (found: ${ruby_version:-none})" + echo "Install via Homebrew: brew install ruby" + echo "Then add to your shell: export PATH=\"/opt/homebrew/opt/ruby/bin:\$PATH\"" + exit 1 +fi + +echo "Ruby $ruby_version OK" +bundle install +echo "Setup complete. 
Run:" +echo " bundle exec bin/parse files/van-gogh-paintings.html" +echo " bundle exec rspec" diff --git a/lib/carousel_parser.rb b/lib/carousel_parser.rb new file mode 100644 index 00000000..9752c774 --- /dev/null +++ b/lib/carousel_parser.rb @@ -0,0 +1,71 @@ +require "nokolexbor" +require_relative "image_extractor" + +# Parses a Google Search HTML page and extracts carousel items as an array of +# hashes with the string keys "name", "extensions", "link" and "image". +# +# Detection is structure-based rather than CSS-class-based: a carousel item is +# any <a> whose href contains both "/search?" and "stick=". This signal is +# part of Google's search URL semantics and is stable across class-name rotations +# and different carousel layouts (paintings, movies, albums, etc.). +class CarouselParser + GOOGLE_BASE = "https://www.google.com" + YEAR_PATTERN = /\A\d{4}\z/ + + def initialize(html) + @doc = Nokolexbor::HTML(html) + @extractor = ImageExtractor.new(@doc) + end + + def parse + carousel_anchors.map { |anchor| extract_item(anchor) } + end + + private + + def carousel_anchors + @doc.css("a").select do |a| + href = a["href"].to_s + href.include?("/search?") && href.include?("stick=") && a.css("img").any? + end + end + + def extract_item(anchor) + labels = leaf_texts(anchor) + img = anchor.css("img").first + + { + "name" => labels[0], + "extensions" => year_extensions(labels[1]), + "link" => normalize_link(anchor["href"]), + "image" => img ? @extractor.for_img(img) : nil + }.tap { |h| h.delete("extensions") if h["extensions"].nil? } + .tap { |h| h.delete("image") if h["image"].nil? } + end + + # Collects text from leaf nodes (nodes with no element children) under the + # anchor. This avoids duplicating text that appears in nested containers and + # reliably separates name from extension without relying on class names. + def leaf_texts(node) + texts = [] + node.traverse do |child| + next unless child.text? + next if child.parent.element_children.any? 
+ + text = child.text.strip + texts << text unless text.empty? + end + texts + end + + def year_extensions(text) + return nil unless text && text.match?(YEAR_PATTERN) + + [text] + end + + def normalize_link(href) + decoded = href.gsub("&amp;", "&") + decoded.start_with?("/") ? "#{GOOGLE_BASE}#{decoded}" : decoded + end +end diff --git a/lib/image_extractor.rb b/lib/image_extractor.rb new file mode 100644 index 00000000..8e4784bf --- /dev/null +++ b/lib/image_extractor.rb @@ -0,0 +1,55 @@ +require "nokolexbor" + +# Resolves carousel thumbnail images from two sources present in the static HTML: +# 1. Inline

Search Results

Robert De Niro
American actor and director
Google apps
Google Account
Test User
user@example.com
\ No newline at end of file diff --git a/spec/fixtures/shinkai-books.html b/spec/fixtures/shinkai-books.html new file mode 100644 index 00000000..71139da1 --- /dev/null +++ b/spec/fixtures/shinkai-books.html @@ -0,0 +1,41 @@ +makoto shinkai books - Google Search

Search Results

Makoto Shinkai
Japanese filmmaker and novelist
Google apps
Google Account
Test User
user@example.com
\ No newline at end of file diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 00000000..c80d44b9 --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,98 @@ +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. 
+ config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # This option will default to `:apply_to_host_groups` in RSpec 4 (and will + # have no way to turn it off -- the option exists only for backwards + # compatibility in RSpec 3). It causes shared context metadata to be + # inherited by the metadata hash of host groups and examples, rather than + # triggering implicit auto-inclusion in groups with matching metadata. + config.shared_context_metadata_behavior = :apply_to_host_groups + +# The settings below are suggested to provide a good initial experience +# with RSpec, but feel free to customize to your heart's content. +=begin + # This allows you to limit a spec run to individual examples or groups + # you care about by tagging them with `:focus` metadata. When nothing + # is tagged with `:focus`, all examples get run. RSpec also provides + # aliases for `it`, `describe`, and `context` that include `:focus` + # metadata: `fit`, `fdescribe` and `fcontext`, respectively. + config.filter_run_when_matching :focus + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. + config.example_status_persistence_file_path = "spec/examples.txt" + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/ + config.disable_monkey_patching! + + # This setting enables warnings. It's recommended, but in some cases may + # be too noisy due to issues in dependencies. 
+ config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = "doc" + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed +=end +end