# Parses a saved Google results page and returns the entries of its carousel,
# keyed by a category name derived from the nearest heading.
#
# Every item carries "name", "link" (a one-element Array) and "extensions".
# Relies on Nokolexbor::HTML, supplied by the file-level `require "nokolexbor"`.
class GoogleCarouselParserOne
  GOOGLE_ORIGIN = "https://www.google.com".freeze
  CANDIDATE_SELECTOR = "a[href*='/search?']".freeze
  HEADING_SELECTOR = "h2, h3, [role='heading'], [data-attrid='title']".freeze

  # @param filepath [String] path of the HTML file to parse
  def initialize(filepath)
    @filepath = filepath
  end

  # Reads and parses the file given to the constructor.
  #
  # @return [Hash{String => Array<Hash>}] `{ category_key => items }`, or `{}`
  #   when no candidate link or no non-empty heading is found
  def parse
    doc = Nokolexbor::HTML(File.read(@filepath))
    candidates = carousel_candidates(doc)
    category_key = category_key_for(candidates)
    # Nothing to key the items under, so do not bother building them.
    return {} unless category_key

    { category_key => candidates.map { |anchor| build_item(anchor) } }
  end

  private

  # Search links that contain both an image and some visible text.
  def carousel_candidates(doc)
    doc.css(CANDIDATE_SELECTOR).select do |anchor|
      anchor.css("img").any? && !anchor.text.strip.empty?
    end
  end

  # Closest ancestor of the first candidate that is shared by all candidates.
  def common_container(candidates)
    shared_counts = candidates.each_with_object(Hash.new(0)) do |anchor, counts|
      anchor.ancestors.each { |ancestor| counts[ancestor.path] += 1 }
    end

    candidates.first&.ancestors&.find do |ancestor|
      shared_counts[ancestor.path] == candidates.size
    end
  end

  # Lower-cased last word of the heading nearest to the shared container.
  def category_key_for(candidates)
    container = common_container(candidates)
    return unless container

    header_text = find_nearest_header(container)
    header_text.split.last.downcase if header_text
  end

  # Builds one result hash: the first non-blank text is the name, the remaining
  # non-blank texts are the extensions.
  def build_item(anchor)
    texts = anchor.css("::text").map { |node| node.text.strip }.reject(&:empty?)

    {
      "name" => texts.first,
      "link" => [absolute_link(anchor["href"])],
      "extensions" => texts.drop(1)
    }
  end

  # Prefixes root-relative hrefs with the Google origin; other hrefs pass through.
  def absolute_link(href)
    href.start_with?("/") ? "#{GOOGLE_ORIGIN}#{href}" : href
  end

  # Walks up from +node+; at each level searches the parent's subtree for
  # heading-like elements and returns the stripped text of the first non-blank
  # one. Returns nil once the top of the tree is passed without a match.
  def find_nearest_header(node)
    current = node

    while current
      headings = current.parent&.css(HEADING_SELECTOR)
      header = headings&.find { |heading| !heading.text.strip.empty? }
      return header.text.strip if header

      current = current.parent
    end

    nil
  end
end

# Same carousel extraction as GoogleCarouselParserOne, but "link" is a plain
# String and every item also carries an "image": the inline data assigned to
# the <img> id by a page script, falling back to the img's data-src attribute.
#
# Relies on Nokolexbor::HTML, supplied by the file-level `require "nokolexbor"`.
class GoogleCarouselParserTwo
  GOOGLE_ORIGIN = "https://www.google.com".freeze
  CANDIDATE_SELECTOR = "a[href*='/search?']".freeze
  HEADING_SELECTOR = "h2, h3, [role='heading'], [data-attrid='title']".freeze

  IMAGE_DATA_PATTERN = /var\s+s\s*=\s*['"]([^'"]+)['"]/.freeze
  IMAGE_IDS_PATTERN = /var\s+ii\s*=\s*\[([^\]]+)\]/.freeze
  QUOTED_VALUE_PATTERN = /['"]([^'"]+)['"]/.freeze
  HEX_ESCAPE_PATTERN = /\\x([0-9a-fA-F]{2})/.freeze

  # @param filepath [String] path of the HTML file to parse
  def initialize(filepath)
    @filepath = filepath
  end

  # Reads and parses the file given to the constructor.
  #
  # @return [Hash{String => Array<Hash>}] `{ category_key => items }`, or `{}`
  #   when no candidate link or no non-empty heading is found
  def parse
    doc = Nokolexbor::HTML(File.read(@filepath))
    images = extract_script_images(doc)
    candidates = carousel_candidates(doc)
    category_key = category_key_for(candidates)
    # Nothing to key the items under, so do not bother building them.
    return {} unless category_key

    { category_key => candidates.map { |anchor| build_item(anchor, images) } }
  end

  private

  # Search links that contain both an image and some visible text.
  def carousel_candidates(doc)
    doc.css(CANDIDATE_SELECTOR).select do |anchor|
      anchor.css("img").any? && !anchor.text.strip.empty?
    end
  end

  # Closest ancestor of the first candidate that is shared by all candidates.
  def common_container(candidates)
    shared_counts = candidates.each_with_object(Hash.new(0)) do |anchor, counts|
      anchor.ancestors.each { |ancestor| counts[ancestor.path] += 1 }
    end

    candidates.first&.ancestors&.find do |ancestor|
      shared_counts[ancestor.path] == candidates.size
    end
  end

  # Lower-cased last word of the heading nearest to the shared container.
  def category_key_for(candidates)
    container = common_container(candidates)
    return unless container

    header_text = find_nearest_header(container)
    header_text.split.last.downcase if header_text
  end

  # Builds one result hash: the first non-blank text is the name, the remaining
  # non-blank texts are the extensions, and the image is looked up by <img> id.
  def build_item(anchor, images)
    texts = anchor.css("::text").map { |node| node.text.strip }.reject(&:empty?)
    img = anchor.css("img").first

    {
      "name" => texts.first,
      "link" => absolute_link(anchor["href"]),
      "extensions" => texts.drop(1),
      "image" => img && (images[img["id"]] || img["data-src"])
    }
  end

  # Prefixes root-relative hrefs with the Google origin; other hrefs pass through.
  def absolute_link(href)
    href.start_with?("/") ? "#{GOOGLE_ORIGIN}#{href}" : href
  end

  # Collects `{ img id => image data }` from scripts that mention "base64".
  #
  # A script may hold several `var s = '…'` / `var ii = [...]` assignments, so
  # all of them are scanned and paired in order of appearance. The first pair
  # is the one a single `match` would have found.
  def extract_script_images(doc)
    doc.css("script").each_with_object({}) do |script, images|
      code = script.text
      next unless code.include?("base64")

      payloads = code.scan(IMAGE_DATA_PATTERN).flatten
      id_lists = code.scan(IMAGE_IDS_PATTERN).flatten

      payloads.zip(id_lists).each do |payload, id_list|
        next unless id_list # a payload with no id list to attach it to

        data = unescape_javascript_string(payload)
        id_list.scan(QUOTED_VALUE_PATTERN).flatten.each { |id| images[id] = data }
      end
    end
  end

  # Decodes the JavaScript string escapes seen in the image payloads:
  # `\xNN` (e.g. \x3d => "=") and `\/`.
  def unescape_javascript_string(str)
    # \xNN names code point U+00NN; decode as UTF-8 so values >= 0x80 become
    # real characters rather than lone binary bytes.
    decoded = str.gsub(HEX_ESCAPE_PATTERN) do
      Regexp.last_match(1).hex.chr(Encoding::UTF_8)
    end

    decoded.gsub("\\/", "/")
  end

  # Walks up from +node+; at each level searches the parent's subtree for
  # heading-like elements and returns the stripped text of the first non-blank
  # one. Returns nil once the top of the tree is passed without a match.
  def find_nearest_header(node)
    current = node

    while current
      headings = current.parent&.css(HEADING_SELECTOR)
      header = headings&.find { |heading| !heading.text.strip.empty? }
      return header.text.strip if header

      current = current.parent
    end

    nil
  end
end
Skip to main contentAccessibility help

Search Results

Barack Obama
44th U.S. President
Google apps
Search Labs
Google Account
Edwin Lovera
cavatio203@gmail.com
\ No newline at end of file diff --git a/spec/fixtures/taylor-swift-albums.html b/spec/fixtures/taylor-swift-albums.html new file mode 100644 index 00000000..e854fa11 --- /dev/null +++ b/spec/fixtures/taylor-swift-albums.html @@ -0,0 +1,59 @@ + +Taylor Swift albums - Google Search
Skip to main contentAccessibility help

Search Results

Taylor Swift
American singer-songwriter
Google apps
Search Labs
Google Account
Edwin Lovera
cavatio203@gmail.com
# spec/google_carousel_parser_one_spec.rb
require 'rspec'
require 'json'
require 'tempfile'
require_relative '../lib/google_carousel_parser_one'

describe GoogleCarouselParserOne do
  # Resolve input files from this spec's location so the suite does not depend
  # on the directory rspec is launched from.
  project_root = File.expand_path("..", __dir__)

  # Shared contract for any page that contains a carousel: the result is keyed
  # by +expected_key+ and every item has a name, a link array and extensions.
  shared_examples "a google carousel search result page" do |filepath, expected_key|
    let(:parser) { GoogleCarouselParserOne.new(filepath) }
    let(:result) { parser.parse }

    it "returns a hash containing the expected category key array" do
      expect(result).to be_a(Hash)
      expect(result[expected_key]).to be_an(Array)
      expect(result[expected_key]).not_to be_empty
    end

    it "validates that every item has a valid name, link array, and extensions array" do
      result[expected_key].each do |item|
        expect(item["name"]).to be_a(String)
        expect(item["name"]).not_to be_empty

        expect(item["link"]).to be_an(Array)
        expect(item["link"]).not_to be_empty
        item["link"].each do |url|
          expect(url).to be_a(String)
          expect(url).to start_with("https://www.google.com/search")
        end

        expect(item["extensions"]).to be_an(Array)
        item["extensions"].each do |ext|
          expect(ext).to be_a(String)
        end

        # Parser one never emits images.
        expect(item["image"]).to be_nil
      end
    end
  end

  describe "Van Gogh Paintings Carousel" do
    it_behaves_like "a google carousel search result page",
                    File.join(project_root, "files/van-gogh-paintings.html"), "artworks"
  end

  describe "Barack Obama Books Carousel" do
    it_behaves_like "a google carousel search result page",
                    File.join(project_root, "spec/fixtures/barack-obama-books.html"), "books"
  end

  describe "Taylor Swift Albums Carousel" do
    it_behaves_like "a google carousel search result page",
                    File.join(project_root, "spec/fixtures/taylor-swift-albums.html"), "albums"
  end

  describe "Empty / No Carousel Page" do
    # A temp file keeps spec/fixtures free of generated files and stops the two
    # parser specs from writing to the same path.
    let(:empty_file) { Tempfile.new(["empty", ".html"]) }

    before do
      empty_file.write("\n\nNo Carousel Here\n\n")
      empty_file.close
    end

    after do
      empty_file.close! # close and delete
    end

    it "returns an empty hash" do
      result = GoogleCarouselParserOne.new(empty_file.path).parse
      expect(result).to eq({})
    end
  end
end

# spec/google_carousel_parser_two_spec.rb
require 'rspec'
require 'json'
require 'tempfile'
require_relative '../lib/google_carousel_parser_two'

describe GoogleCarouselParserTwo do
  # Resolve input files from this spec's location so the suite does not depend
  # on the directory rspec is launched from.
  project_root = File.expand_path("..", __dir__)

  # Shared contract for any page that contains a carousel: the result is keyed
  # by +expected_key+ and every item has a name, a link string, extensions and
  # optionally an image.
  shared_examples "a google carousel search result page" do |filepath, expected_key|
    let(:parser) { GoogleCarouselParserTwo.new(filepath) }
    let(:result) { parser.parse }

    it "returns a hash containing the expected category key array" do
      expect(result).to be_a(Hash)
      expect(result[expected_key]).to be_an(Array)
      expect(result[expected_key]).not_to be_empty
    end

    it "validates that every item has a valid name, link, and optionally extensions/image" do
      result[expected_key].each do |item|
        expect(item["name"]).to be_a(String)
        expect(item["name"]).not_to be_empty

        expect(item["link"]).to be_a(String)
        expect(item["link"]).to start_with("https://www.google.com/search")

        expect(item["extensions"]).to be_an(Array)
        item["extensions"].each do |ext|
          expect(ext).to be_a(String)
        end

        if item["image"]
          expect(item["image"]).to be_a(String)
          expect(item["image"]).to start_with("data:image/").or start_with("https://encrypted-tbn")
        end
      end
    end
  end

  describe "Van Gogh Paintings Carousel" do
    van_gogh_page = File.join(project_root, "files/van-gogh-paintings.html")

    it_behaves_like "a google carousel search result page", van_gogh_page, "artworks"

    it "exactly matches the expected JSON output" do
      expected = JSON.parse(File.read(File.join(project_root, "files/expected-array.json")))
      actual = GoogleCarouselParserTwo.new(van_gogh_page).parse

      expect(actual["artworks"].size).to eq(expected["artworks"].size)

      actual["artworks"].each_with_index do |artwork, index|
        exp = expected["artworks"][index]
        expect(artwork["name"]).to eq(exp["name"])
        expect(artwork["link"]).to eq(exp["link"])
        # The expected JSON may omit "extensions"; the parser always returns an array.
        expect(artwork["extensions"]).to eq(exp["extensions"] || [])
        expect(artwork["image"]).to eq(exp["image"])
      end
    end
  end

  describe "Barack Obama Books Carousel" do
    it_behaves_like "a google carousel search result page",
                    File.join(project_root, "spec/fixtures/barack-obama-books.html"), "books"
  end

  describe "Taylor Swift Albums Carousel" do
    it_behaves_like "a google carousel search result page",
                    File.join(project_root, "spec/fixtures/taylor-swift-albums.html"), "albums"
  end

  describe "Empty / No Carousel Page" do
    # A temp file keeps spec/fixtures free of generated files and stops the two
    # parser specs from writing to the same path.
    let(:empty_file) { Tempfile.new(["empty", ".html"]) }

    before do
      empty_file.write("\n\nNo Carousel Here\n\n")
      empty_file.close
    end

    after do
      empty_file.close! # close and delete
    end

    it "returns an empty hash" do
      result = GoogleCarouselParserTwo.new(empty_file.path).parse
      expect(result).to eq({})
    end
  end
end