Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
source "https://rubygems.org"

gem "nokolexbor"
gem "rspec"
28 changes: 28 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
GEM
remote: https://rubygems.org/
specs:
diff-lcs (1.6.2)
nokolexbor (0.7.0-arm64-darwin)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.6)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.8)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.7)

PLATFORMS
arm64-darwin-22

DEPENDENCIES
nokolexbor
rspec

BUNDLED WITH
2.3.7
63 changes: 63 additions & 0 deletions lib/google_carousel_parser_one.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
require "nokolexbor"

# Extracts carousel entries from a saved Google results page.
#
# Entries are the search links that wrap an image and carry visible text. They
# are returned under a key built from the last word of a heading located by
# walking up from the ancestor that all entries share.
class GoogleCarouselParserOne
  # @param filepath [String] path of the HTML file to read when #parse is called
  def initialize(filepath)
    @filepath = filepath
  end

  # Reads and parses the file, then builds the carousel hash.
  #
  # @return [Hash{String => Array<Hash>}] a single category key mapped to the
  #   entries (each with "name", "link" as a one-element array, and
  #   "extensions"); an empty hash when no category key can be determined
  def parse
    page = Nokolexbor::HTML(File.read(@filepath))
    anchors = carousel_anchors(page)
    key = category_key(anchors)
    entries = anchors.map { |anchor| build_entry(anchor) }

    return {} unless key

    { key => entries }
  end

  private

  # Search links that contain at least one <img> and some non-blank text.
  def carousel_anchors(page)
    page.css("a[href*='/search?']").select do |anchor|
      anchor.css("img").any? && !anchor.text.strip.empty?
    end
  end

  # Lower-cased last word of the heading found for the shared container, or
  # nil when there is no container or no heading.
  def category_key(anchors)
    container = shared_container(anchors)
    return unless container

    title = find_nearest_header(container)
    title && title.split.last.downcase
  end

  # First ancestor of the first anchor that is also an ancestor of every other
  # anchor; ancestors are compared by their path.
  # NOTE(review): relies on `ancestors` listing the closest ancestor first - confirm.
  def shared_container(anchors)
    hits = Hash.new(0)
    anchors.each do |anchor|
      anchor.ancestors.each { |node| hits[node.path] += 1 }
    end

    leader = anchors.first
    return unless leader

    leader.ancestors.find { |node| hits[node.path] == anchors.size }
  end

  # Converts one anchor into its result hash: the first non-blank text node is
  # the name and the remaining ones become the extensions.
  def build_entry(anchor)
    labels = anchor.css("::text").map { |node| node.text.strip }.reject(&:empty?)

    href = anchor["href"]
    url =
      if href.start_with?("/")
        "https://www.google.com#{href}"
      else
        href
      end

    {
      "name" => labels.first,
      "link" => [url],
      "extensions" => labels.drop(1)
    }
  end

  # Walks up from +node+, one parent at a time, and returns the stripped text
  # of the first non-blank heading matched beneath that parent.
  #
  # @return [String, nil] the heading text, or nil when the chain is exhausted
  def find_nearest_header(node)
    scope = node&.parent

    while scope
      scope.css("h2, h3, [role='heading'], [data-attrid='title']").each do |heading|
        title = heading.text.strip
        return title unless title.empty?
      end
      scope = scope.parent
    end

    nil
  end
end
98 changes: 98 additions & 0 deletions lib/google_carousel_parser_two.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
require "nokolexbor"

# Extracts carousel entries, including their thumbnails, from a saved Google
# results page.
#
# Entries are the search links that wrap an image and carry visible text. They
# are returned under a key built from the last word of a heading located by
# walking up from the ancestor that all entries share. Thumbnails are looked up
# by <img> id in data collected from the page's <script> tags, falling back to
# the image's "data-src" attribute.
class GoogleCarouselParserTwo
  # @param filepath [String] path of the HTML file to read when #parse is called
  def initialize(filepath)
    @filepath = filepath
  end

  # Reads and parses the file, then builds the carousel hash.
  #
  # @return [Hash{String => Array<Hash>}] a single category key mapped to the
  #   entries (each with "name", "link", "extensions" and "image"); an empty
  #   hash when no category key can be determined
  def parse
    doc = Nokolexbor::HTML(File.read(@filepath))
    images_map = extract_script_images(doc)
    candidates = carousel_anchors(doc)
    category_key = category_key_for(candidates)
    items = candidates.map { |anchor| build_item(anchor, images_map) }

    category_key ? { category_key => items } : {}
  end

  private

  # Search links that contain at least one <img> and some non-blank text.
  def carousel_anchors(doc)
    doc.css("a[href*='/search?']").select do |anchor|
      anchor.css("img").any? && anchor.text.strip.length.positive?
    end
  end

  # Lower-cased last word of the heading found for the shared container, or
  # nil when there is no container or no heading.
  def category_key_for(candidates)
    container = shared_container(candidates)
    return unless container

    header_text = find_nearest_header(container)
    header_text&.split&.last&.downcase
  end

  # First ancestor of the first candidate that is also an ancestor of every
  # other candidate; ancestors are compared by their path.
  # NOTE(review): relies on `ancestors` listing the closest ancestor first - confirm.
  def shared_container(candidates)
    ancestor_counts = candidates.each_with_object(Hash.new(0)) do |item, counts|
      item.ancestors.each { |ancestor| counts[ancestor.path] += 1 }
    end

    candidates.first&.ancestors&.find do |ancestor|
      ancestor_counts[ancestor.path] == candidates.size
    end
  end

  # Converts one anchor into its result hash: the first non-blank text node is
  # the name and the remaining ones become the extensions.
  def build_item(anchor, images_map)
    texts = anchor.css("::text").map { |node| node.text.strip }.reject(&:empty?)

    href = anchor["href"]
    link = href.start_with?("/") ? "https://www.google.com#{href}" : href

    img_node = anchor.css("img").first
    image = img_node && (images_map[img_node["id"]] || img_node["data-src"])

    {
      "name" => texts.first,
      "link" => link,
      "extensions" => texts.drop(1),
      "image" => image
    }
  end

  # Builds an "image id => image data" map from scripts that mention "base64".
  #
  # Every `var s = '...'` / `var ii = [...]` occurrence in a script is
  # collected and paired in order of appearance, so a script carrying several
  # assignments contributes all of them rather than only the first one.
  #
  # @param doc [#css] parsed document
  # @return [Hash{String => String}] unescaped image data keyed by image id
  def extract_script_images(doc)
    map = {}

    doc.css("script").each do |script|
      code = script.text
      next unless code.include?("base64")

      payloads = code.scan(/var\s+s\s*=\s*['"]([^'"]+)['"]/).flatten
      id_lists = code.scan(/var\s+ii\s*=\s*\[([^\]]+)\]/).flatten

      payloads.zip(id_lists).each do |payload, id_list|
        # A payload without a matching id list cannot be attached to any image.
        next unless id_list

        base64_data = unescape_javascript_string(payload)
        id_list.scan(/['"]([^'"]+)['"]/).flatten.each { |id| map[id] = base64_data }
      end
    end

    map
  end

  # Decodes the escapes handled here: `\xNN` hex escapes and `\/`.
  #
  # @param str [String] raw contents of a JavaScript string literal
  # @return [String] the string with those escapes replaced
  def unescape_javascript_string(str)
    # Decode as a UTF-8 code point so values above 0x7f do not produce a
    # binary-encoded fragment inside the result.
    decoded = str.gsub(/\\x([0-9a-fA-F]{2})/) { Regexp.last_match(1).hex.chr(Encoding::UTF_8) }

    # Convert \/ to /
    decoded.gsub("\\/", "/")
  end

  # Walks up from +node+, one parent at a time, and returns the stripped text
  # of the first non-blank heading matched beneath that parent.
  #
  # @return [String, nil] the heading text, or nil when the chain is exhausted
  def find_nearest_header(node)
    curr = node

    while curr
      headings = curr.parent&.css("h2, h3, [role='heading'], [data-attrid='title']")
      header = headings&.find { |h| !h.text.strip.empty? }
      return header.text.strip if header

      curr = curr.parent
    end

    nil
  end
end
51 changes: 51 additions & 0 deletions spec/fixtures/barack-obama-books.html

Large diffs are not rendered by default.

59 changes: 59 additions & 0 deletions spec/fixtures/taylor-swift-albums.html

Large diffs are not rendered by default.

66 changes: 66 additions & 0 deletions spec/google_carousel_parser_one_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
require 'rspec'
require 'json'
require_relative '../lib/google_carousel_parser_one'

describe GoogleCarouselParserOne do
  # Resolve every fixture against the repository root (the parent of this
  # spec's directory) so the suite does not depend on the directory rspec is
  # started from.
  repo_root = File.expand_path("..", __dir__)
  fixture_path = ->(relative) { File.join(repo_root, relative) }

  # Shared contract for any page with a carousel: the result exposes the
  # expected category key and every item has the shape this parser produces.
  shared_examples "a google carousel search result page" do |filepath, expected_key|
    let(:parser) { GoogleCarouselParserOne.new(filepath) }
    let(:result) { parser.parse }

    it "returns a hash containing the expected category key array" do
      expect(result).to be_a(Hash)
      expect(result[expected_key]).to be_an(Array)
      expect(result[expected_key]).not_to be_empty
    end

    it "validates that every item has a valid name, link array, and extensions array" do
      result[expected_key].each do |item|
        expect(item["name"]).to be_a(String)
        expect(item["name"]).not_to be_empty

        expect(item["link"]).to be_an(Array)
        expect(item["link"]).not_to be_empty
        item["link"].each do |url|
          expect(url).to be_a(String)
          expect(url).to start_with("https://www.google.com/search")
        end

        expect(item["extensions"]).to be_an(Array)
        item["extensions"].each do |ext|
          expect(ext).to be_a(String)
        end

        # This parser variant does not emit an "image" key.
        expect(item["image"]).to be_nil
      end
    end
  end

  describe "Van Gogh Paintings Carousel" do
    it_behaves_like "a google carousel search result page",
                    fixture_path.call("files/van-gogh-paintings.html"), "artworks"
  end

  describe "Barack Obama Books Carousel" do
    it_behaves_like "a google carousel search result page",
                    fixture_path.call("spec/fixtures/barack-obama-books.html"), "books"
  end

  describe "Taylor Swift Albums Carousel" do
    it_behaves_like "a google carousel search result page",
                    fixture_path.call("spec/fixtures/taylor-swift-albums.html"), "albums"
  end

  describe "Empty / No Carousel Page" do
    let(:empty_file) { fixture_path.call("spec/fixtures/empty.html") }

    # The page is created for the example and removed afterwards.
    before do
      File.write(empty_file, "<html><body><h1>No Carousel Here</h1></body></html>")
    end

    after do
      File.delete(empty_file) if File.exist?(empty_file)
    end

    it "returns an empty hash" do
      result = GoogleCarouselParserOne.new(empty_file).parse
      expect(result).to eq({})
    end
  end
end
80 changes: 80 additions & 0 deletions spec/google_carousel_parser_two_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
require 'rspec'
require 'json'
require_relative '../lib/google_carousel_parser_two'

describe GoogleCarouselParserTwo do
  # Resolve every fixture against the repository root (the parent of this
  # spec's directory) so the suite does not depend on the directory rspec is
  # started from.
  repo_root = File.expand_path("..", __dir__)
  fixture_path = ->(relative) { File.join(repo_root, relative) }

  # Shared contract for any page with a carousel: the result exposes the
  # expected category key and every item has the shape this parser produces.
  shared_examples "a google carousel search result page" do |filepath, expected_key|
    let(:parser) { GoogleCarouselParserTwo.new(filepath) }
    let(:result) { parser.parse }

    it "returns a hash containing the expected category key array" do
      expect(result).to be_a(Hash)
      expect(result[expected_key]).to be_an(Array)
      expect(result[expected_key]).not_to be_empty
    end

    it "validates that every item has a valid name, link, and optionally extensions/image" do
      result[expected_key].each do |item|
        expect(item["name"]).to be_a(String)
        expect(item["name"]).not_to be_empty

        expect(item["link"]).to be_a(String)
        expect(item["link"]).to start_with("https://www.google.com/search")

        expect(item["extensions"]).to be_an(Array)
        item["extensions"].each do |ext|
          expect(ext).to be_a(String)
        end

        # An image is optional; when present it must be inline data or a
        # thumbnail URL.
        if item["image"]
          expect(item["image"]).to be_a(String)
          expect(item["image"]).to start_with("data:image/").or start_with("https://encrypted-tbn")
        end
      end
    end
  end

  describe "Van Gogh Paintings Carousel" do
    it_behaves_like "a google carousel search result page",
                    fixture_path.call("files/van-gogh-paintings.html"), "artworks"

    it "exactly matches the expected JSON output" do
      expected = JSON.parse(File.read(fixture_path.call("files/expected-array.json")))
      actual = GoogleCarouselParserTwo.new(fixture_path.call("files/van-gogh-paintings.html")).parse

      expect(actual["artworks"].size).to eq(expected["artworks"].size)

      actual["artworks"].each_with_index do |artwork, index|
        exp = expected["artworks"][index]
        expect(artwork["name"]).to eq(exp["name"])
        expect(artwork["link"]).to eq(exp["link"])
        # The parser always emits an array, so a missing expected key is
        # compared as an empty list.
        expect(artwork["extensions"]).to eq(exp["extensions"] || [])
        expect(artwork["image"]).to eq(exp["image"])
      end
    end
  end

  describe "Barack Obama Books Carousel" do
    it_behaves_like "a google carousel search result page",
                    fixture_path.call("spec/fixtures/barack-obama-books.html"), "books"
  end

  describe "Taylor Swift Albums Carousel" do
    it_behaves_like "a google carousel search result page",
                    fixture_path.call("spec/fixtures/taylor-swift-albums.html"), "albums"
  end

  describe "Empty / No Carousel Page" do
    let(:empty_file) { fixture_path.call("spec/fixtures/empty.html") }

    # The page is created for the example and removed afterwards.
    before do
      File.write(empty_file, "<html><body><h1>No Carousel Here</h1></body></html>")
    end

    after do
      File.delete(empty_file) if File.exist?(empty_file)
    end

    it "returns an empty hash" do
      result = GoogleCarouselParserTwo.new(empty_file).parse
      expect(result).to eq({})
    end
  end
end