serpapi · Ed-Lovera · Jun 23, 2026
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,4 @@
+source "https://rubygems.org"
+
+gem "nokolexbor"
+gem "rspec"
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -0,0 +1,28 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    diff-lcs (1.6.2)
+    nokolexbor (0.7.0-arm64-darwin)
+    rspec (3.13.2)
+      rspec-core (~> 3.13.0)
+      rspec-expectations (~> 3.13.0)
+      rspec-mocks (~> 3.13.0)
+    rspec-core (3.13.6)
+      rspec-support (~> 3.13.0)
+    rspec-expectations (3.13.5)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-mocks (3.13.8)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-support (3.13.7)
+
+PLATFORMS
+  arm64-darwin-22
+
+DEPENDENCIES
+  nokolexbor
+  rspec
+
+BUNDLED WITH
+   2.3.7
diff --git a/lib/google_carousel_parser_one.rb b/lib/google_carousel_parser_one.rb
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+require "nokolexbor"
+
+# Extracts the items of a Google search carousel from a saved results page.
+#
+# Items are located structurally rather than by CSS class: every anchor whose
+# href contains "/search?", that wraps an <img> and has non-blank text, is
+# treated as a carousel item. The category name comes from the first non-blank
+# heading found while widening the search outward from the items' shared
+# ancestor.
+class GoogleCarouselParserOne
+  GOOGLE_ORIGIN = "https://www.google.com"
+  ITEM_LINK_SELECTOR = "a[href*='/search?']"
+  HEADING_SELECTOR = "h2, h3, [role='heading'], [data-attrid='title']"
+
+  # @param filepath [String] path of the HTML file to parse
+  def initialize(filepath)
+    @filepath = filepath
+  end
+
+  # Reads the file and extracts the carousel.
+  #
+  # @return [Hash{String => Array<Hash>}] one entry keyed by the downcased last
+  #   word of the carousel heading. Each item has "name" (String),
+  #   "link" (one-element Array) and "extensions" (Array of String).
+  #   Returns {} when no candidate anchor or no heading is found.
+  def parse
+    doc = Nokolexbor::HTML(File.read(@filepath))
+    candidates = carousel_candidates(doc)
+
+    category_key = category_key_for(candidates)
+    # Without a key the result is empty, so skip building the items at all.
+    return {} unless category_key
+
+    { category_key => candidates.map { |anchor| build_item(anchor) } }
+  end
+
+  private
+
+  # Anchors that look like carousel items (see the class comment).
+  def carousel_candidates(doc)
+    doc.css(ITEM_LINK_SELECTOR).select do |anchor|
+      anchor.css("img").any? && !anchor.text.strip.empty?
+    end
+  end
+
+  # Key for the result hash: the downcased last word of the heading nearest to
+  # the candidates' shared ancestor, or nil when either is missing.
+  def category_key_for(candidates)
+    container = common_ancestor(candidates)
+    return unless container
+
+    header_text = find_nearest_header(container)
+    header_text.split.last.downcase if header_text
+  end
+
+  # First ancestor of the first candidate that contains every candidate.
+  # Ancestors are tallied by their document path; this is the lowest common
+  # ancestor provided `ancestors` yields the nearest ancestor first (assumed).
+  def common_ancestor(candidates)
+    counts = candidates.each_with_object(Hash.new(0)) do |anchor, tally|
+      anchor.ancestors.each { |ancestor| tally[ancestor.path] += 1 }
+    end
+
+    candidates.first&.ancestors&.find do |ancestor|
+      counts[ancestor.path] == candidates.size
+    end
+  end
+
+  # Builds one result item. The first non-blank text node under the anchor is
+  # the name; any remaining ones are the extensions.
+  def build_item(anchor)
+    texts = anchor.css("::text").map { |node| node.text.strip }.reject(&:empty?)
+    name, *extensions = texts
+
+    href = anchor["href"]
+    # Root-relative hrefs are made absolute against the Google origin.
+    link = href.start_with?("/") ? "#{GOOGLE_ORIGIN}#{href}" : href
+
+    { "name" => name, "link" => [link], "extensions" => extensions }
+  end
+
+  # Searches node's parent, then each higher ancestor in turn, and returns the
+  # stripped text of the first non-blank heading matched, or nil if none.
+  def find_nearest_header(node)
+    scope = node.parent
+
+    while scope
+      header = scope.css(HEADING_SELECTOR).find { |h| !h.text.strip.empty? }
+      return header.text.strip if header
+
+      scope = scope.parent
+    end
+
+    nil
+  end
+end
diff --git a/lib/google_carousel_parser_two.rb b/lib/google_carousel_parser_two.rb
@@ -0,0 +1,138 @@
+# frozen_string_literal: true
+
+require "nokolexbor"
+
+# Extracts the items of a Google search carousel, thumbnails included, from a
+# saved results page.
+#
+# Items are located structurally rather than by CSS class: every anchor whose
+# href contains "/search?", that wraps an <img> and has non-blank text, is
+# treated as a carousel item. The category name comes from the first non-blank
+# heading found while widening the search outward from the items' shared
+# ancestor. Thumbnails are looked up by <img> id among the `var s` / `var ii`
+# assignments in inline scripts, falling back to the img's data-src attribute.
+class GoogleCarouselParserTwo
+  GOOGLE_ORIGIN = "https://www.google.com"
+  ITEM_LINK_SELECTOR = "a[href*='/search?']"
+  HEADING_SELECTOR = "h2, h3, [role='heading'], [data-attrid='title']"
+
+  # Match `var s = '<payload>'` and `var ii = ['<id>', ...]` in script source.
+  IMAGE_DATA_PATTERN = /var\s+s\s*=\s*['"]([^'"]+)['"]/
+  IMAGE_IDS_PATTERN = /var\s+ii\s*=\s*\[([^\]]+)\]/
+  QUOTED_STRING_PATTERN = /['"]([^'"]+)['"]/
+  HEX_ESCAPE_PATTERN = /\\x([0-9a-fA-F]{2})/
+
+  # @param filepath [String] path of the HTML file to parse
+  def initialize(filepath)
+    @filepath = filepath
+  end
+
+  # Reads the file and extracts the carousel.
+  #
+  # @return [Hash{String => Array<Hash>}] one entry keyed by the downcased last
+  #   word of the carousel heading. Each item has "name" (String), "link"
+  #   (String), "extensions" (Array of String) and "image" (String or nil).
+  #   Returns {} when no candidate anchor or no heading is found.
+  def parse
+    doc = Nokolexbor::HTML(File.read(@filepath))
+    candidates = carousel_candidates(doc)
+
+    category_key = category_key_for(candidates)
+    # Without a key the result is empty, so skip the script scan and item build.
+    return {} unless category_key
+
+    images_map = extract_script_images(doc)
+    { category_key => candidates.map { |anchor| build_item(anchor, images_map) } }
+  end
+
+  private
+
+  # Anchors that look like carousel items (see the class comment).
+  def carousel_candidates(doc)
+    doc.css(ITEM_LINK_SELECTOR).select do |anchor|
+      anchor.css("img").any? && !anchor.text.strip.empty?
+    end
+  end
+
+  # Key for the result hash: the downcased last word of the heading nearest to
+  # the candidates' shared ancestor, or nil when either is missing.
+  def category_key_for(candidates)
+    container = common_ancestor(candidates)
+    return unless container
+
+    header_text = find_nearest_header(container)
+    header_text.split.last.downcase if header_text
+  end
+
+  # First ancestor of the first candidate that contains every candidate.
+  # Ancestors are tallied by their document path; this is the lowest common
+  # ancestor provided `ancestors` yields the nearest ancestor first (assumed).
+  def common_ancestor(candidates)
+    counts = candidates.each_with_object(Hash.new(0)) do |anchor, tally|
+      anchor.ancestors.each { |ancestor| tally[ancestor.path] += 1 }
+    end
+
+    candidates.first&.ancestors&.find do |ancestor|
+      counts[ancestor.path] == candidates.size
+    end
+  end
+
+  # Builds one result item. The first non-blank text node under the anchor is
+  # the name; any remaining ones are the extensions.
+  def build_item(anchor, images_map)
+    texts = anchor.css("::text").map { |node| node.text.strip }.reject(&:empty?)
+    name, *extensions = texts
+
+    href = anchor["href"]
+    # Root-relative hrefs are made absolute against the Google origin.
+    link = href.start_with?("/") ? "#{GOOGLE_ORIGIN}#{href}" : href
+
+    img = anchor.css("img").first
+    # Prefer the script-provided image for this id, else the data-src URL.
+    image = images_map[img["id"]] || img["data-src"] if img
+
+    { "name" => name, "link" => link, "extensions" => extensions, "image" => image }
+  end
+
+  # Maps image ids to payloads declared in inline scripts. A script may contain
+  # several `var s` / `var ii` pairs; the n-th payload is paired with the n-th
+  # id list, and every id in a list receives that payload.
+  def extract_script_images(doc)
+    doc.css("script").each_with_object({}) do |script, map|
+      code = script.text
+      next unless code.include?("base64")
+
+      payloads = code.scan(IMAGE_DATA_PATTERN).flatten
+      id_lists = code.scan(IMAGE_IDS_PATTERN).flatten
+
+      payloads.zip(id_lists).each do |payload, id_list|
+        next unless id_list
+
+        data = unescape_javascript_string(payload)
+        id_list.scan(QUOTED_STRING_PATTERN).flatten.each { |id| map[id] = data }
+      end
+    end
+  end
+
+  # Decodes the escapes handled here: \xNN hex escapes (e.g. \x3d -> "=") and
+  # escaped forward slashes (\/ -> "/").
+  def unescape_javascript_string(str)
+    str.gsub(HEX_ESCAPE_PATTERN) { Regexp.last_match(1).hex.chr }
+       .gsub("\\/", "/")
+  end
+
+  # Searches node's parent, then each higher ancestor in turn, and returns the
+  # stripped text of the first non-blank heading matched, or nil if none.
+  def find_nearest_header(node)
+    scope = node.parent
+
+    while scope
+      header = scope.css(HEADING_SELECTOR).find { |h| !h.text.strip.empty? }
+      return header.text.strip if header
+
+      scope = scope.parent
+    end
+
+    nil
+  end
+end
diff --git a/spec/fixtures/barack-obama-books.html b/spec/fixtures/barack-obama-books.html
diff --git a/spec/fixtures/taylor-swift-albums.html b/spec/fixtures/taylor-swift-albums.html
diff --git a/spec/google_carousel_parser_one_spec.rb b/spec/google_carousel_parser_one_spec.rb
@@ -0,0 +1,70 @@
+require 'rspec'
+require 'json'
+require 'tempfile'
+require_relative '../lib/google_carousel_parser_one'
+
+RSpec.describe GoogleCarouselParserOne do
+  # Fixture paths are resolved against the project root so the suite does not
+  # depend on the directory rspec is launched from.
+  project_root = File.expand_path("..", __dir__)
+
+  # Contract shared by every carousel fixture.
+  #
+  # @param filepath [String] HTML fixture to parse
+  # @param expected_key [String] category key the parser should produce
+  shared_examples "a google carousel search result page" do |filepath, expected_key|
+    let(:parser) { GoogleCarouselParserOne.new(filepath) }
+    let(:result) { parser.parse }
+
+    it "returns a hash containing the expected category key array" do
+      expect(result).to be_a(Hash)
+      expect(result[expected_key]).to be_an(Array)
+      expect(result[expected_key]).not_to be_empty
+    end
+
+    it "validates that every item has a valid name, link array, and extensions array" do
+      result.fetch(expected_key).each do |item|
+        expect(item["name"]).to be_a(String)
+        expect(item["name"]).not_to be_empty
+
+        expect(item["link"]).to be_an(Array)
+        expect(item["link"]).not_to be_empty
+        expect(item["link"]).to all(be_a(String).and(start_with("https://www.google.com/search")))
+
+        expect(item["extensions"]).to be_an(Array)
+        expect(item["extensions"]).to all(be_a(String))
+
+        # This parser variant does not emit thumbnails.
+        expect(item["image"]).to be_nil
+      end
+    end
+  end
+
+  describe "Van Gogh Paintings Carousel" do
+    it_behaves_like "a google carousel search result page",
+                    File.join(project_root, "files/van-gogh-paintings.html"), "artworks"
+  end
+
+  describe "Barack Obama Books Carousel" do
+    it_behaves_like "a google carousel search result page",
+                    File.join(project_root, "spec/fixtures/barack-obama-books.html"), "books"
+  end
+
+  describe "Taylor Swift Albums Carousel" do
+    it_behaves_like "a google carousel search result page",
+                    File.join(project_root, "spec/fixtures/taylor-swift-albums.html"), "albums"
+  end
+
+  describe "Empty / No Carousel Page" do
+    it "returns an empty hash" do
+      # A temp file keeps the repo's fixture directory untouched and cannot
+      # collide with other specs; it is removed when the block exits.
+      Tempfile.create(["empty", ".html"]) do |file|
+        file.write("<html><body><h1>No Carousel Here</h1></body></html>")
+        file.flush
+
+        expect(GoogleCarouselParserOne.new(file.path).parse).to eq({})
+      end
+    end
+  end
+end
diff --git a/spec/google_carousel_parser_two_spec.rb b/spec/google_carousel_parser_two_spec.rb
@@ -0,0 +1,88 @@
+require 'rspec'
+require 'json'
+require 'tempfile'
+require_relative '../lib/google_carousel_parser_two'
+
+RSpec.describe GoogleCarouselParserTwo do
+  # Fixture paths are resolved against the project root so the suite does not
+  # depend on the directory rspec is launched from.
+  project_root = File.expand_path("..", __dir__)
+
+  # Contract shared by every carousel fixture.
+  #
+  # @param filepath [String] HTML fixture to parse
+  # @param expected_key [String] category key the parser should produce
+  shared_examples "a google carousel search result page" do |filepath, expected_key|
+    let(:parser) { GoogleCarouselParserTwo.new(filepath) }
+    let(:result) { parser.parse }
+
+    it "returns a hash containing the expected category key array" do
+      expect(result).to be_a(Hash)
+      expect(result[expected_key]).to be_an(Array)
+      expect(result[expected_key]).not_to be_empty
+    end
+
+    it "validates that every item has a valid name, link, and optionally extensions/image" do
+      result.fetch(expected_key).each do |item|
+        expect(item["name"]).to be_a(String)
+        expect(item["name"]).not_to be_empty
+
+        expect(item["link"]).to be_a(String)
+        expect(item["link"]).to start_with("https://www.google.com/search")
+
+        expect(item["extensions"]).to be_an(Array)
+        expect(item["extensions"]).to all(be_a(String))
+
+        # Images are optional; when present they are inline data or a thumbnail URL.
+        next unless item["image"]
+
+        expect(item["image"]).to be_a(String)
+        expect(item["image"]).to start_with("data:image/").or start_with("https://encrypted-tbn")
+      end
+    end
+  end
+
+  describe "Van Gogh Paintings Carousel" do
+    van_gogh_html = File.join(project_root, "files/van-gogh-paintings.html")
+
+    it_behaves_like "a google carousel search result page", van_gogh_html, "artworks"
+
+    it "exactly matches the expected JSON output" do
+      expected = JSON.parse(File.read(File.join(project_root, "files/expected-array.json")))
+      actual = GoogleCarouselParserTwo.new(van_gogh_html).parse
+
+      expect(actual["artworks"].size).to eq(expected["artworks"].size)
+
+      actual["artworks"].zip(expected["artworks"]).each do |artwork, exp|
+        expect(artwork["name"]).to eq(exp["name"])
+        expect(artwork["link"]).to eq(exp["link"])
+        # An expected item may carry no "extensions" key; treat that as empty.
+        expect(artwork["extensions"]).to eq(exp["extensions"] || [])
+        expect(artwork["image"]).to eq(exp["image"])
+      end
+    end
+  end
+
+  describe "Barack Obama Books Carousel" do
+    it_behaves_like "a google carousel search result page",
+                    File.join(project_root, "spec/fixtures/barack-obama-books.html"), "books"
+  end
+
+  describe "Taylor Swift Albums Carousel" do
+    it_behaves_like "a google carousel search result page",
+                    File.join(project_root, "spec/fixtures/taylor-swift-albums.html"), "albums"
+  end
+
+  describe "Empty / No Carousel Page" do
+    it "returns an empty hash" do
+      # A temp file keeps the repo's fixture directory untouched and cannot
+      # collide with other specs; it is removed when the block exits.
+      Tempfile.create(["empty", ".html"]) do |file|
+        file.write("<html><body><h1>No Carousel Here</h1></body></html>")
+        file.flush
+
+        expect(GoogleCarouselParserTwo.new(file.path).parse).to eq({})
+      end
+    end
+  end
+end