Testing Documentation Scraper Implementation in DevDocs
This test suite validates the core scraping functionality of the DevDocs documentation scraper. It exercises URL processing, page building, and content filtering, and verifies that subclass inheritance and configuration management behave correctly.
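For context, a concrete scraper in DevDocs is configured through class-level attributes like the ones exercised in this suite. A minimal, purely illustrative sketch (the class name and values below are hypothetical, not taken from the test file):

module Docs
  # Hypothetical example scraper: shows the class-level configuration
  # attributes the tests below exercise (type, base_url, root_path,
  # initial_paths), inherited from Docs::Scraper.
  class ExampleDoc < Scraper
    self.type = 'example'
    self.base_url = 'https://example.org/docs/'
    self.root_path = '/index'
    self.initial_paths = ['/guide']
  end
end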
Repository: freecodecamp/devdocs
File: test/lib/docs/core/scraper_test.rb
require_relative '../../../test_helper'
require_relative '../../../../lib/docs'

class DocsScraperTest < Minitest::Spec
  class Scraper < Docs::Scraper
    self.type = 'scraper'
    self.base_url = 'http://example.com/'
    self.root_path = '/root'
    self.initial_paths = ['/initial']
    self.html_filters = Docs::FilterStack.new
    self.text_filters = Docs::FilterStack.new
  end

  let :scraper do
    Scraper.new.tap do |scraper|
      scraper.extend FakeInstrumentation
    end
  end

  let :response do
    Struct.new(:body, :url).new
  end
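
  # Subclassing a Docs::Scraper must copy, not share, mutable class-level
  # state: the refute_same assertions below ensure each subclass gets its
  # own initial_paths, options, and filter stacks rather than aliases of
  # the parent's.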
describe ".inherited" do
let :subclass do
Class.new Scraper
end
it "sets .type" do
assert_equal Scraper.type, subclass.type
end
it "sets .root_path" do
assert_equal Scraper.root_path, subclass.root_path
end
it "duplicates .initial_paths" do
stub(Scraper).initial_paths { ['path'] }
assert_equal Scraper.initial_paths, subclass.initial_paths
refute_same Scraper.initial_paths, subclass.initial_paths
end
it "duplicates .options" do
stub(Scraper).options { { test: [] } }
assert_equal Scraper.options, subclass.options
refute_same Scraper.options, subclass.options
refute_same Scraper.options[:test], subclass.options[:test]
end
it "duplicates .html_filters" do
assert_equal Scraper.html_filters, subclass.html_filters
refute_same Scraper.html_filters, subclass.html_filters
end
it "duplicates .text_filters" do
assert_equal Scraper.text_filters, subclass.text_filters
refute_same Scraper.text_filters, subclass.text_filters
end
end
describe ".filters" do
it "returns the union of .html_filters and .text_filters" do
stub(Scraper.html_filters).to_a { [1] }
stub(Scraper.text_filters).to_a { [2] }
assert_equal [1, 2], Scraper.filters
end
end
describe "#root_path?" do
it "returns false when .root_path is blank" do
stub(Scraper).root_path { '' }
refute scraper.root_path?
end
it "returns false when .root_path is '/'" do
stub(Scraper).root_path { '/' }
refute scraper.root_path?
end
it "returns true when .root_path is '/path'" do
stub(Scraper).root_path { '/path' }
assert scraper.root_path?
end
end
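
  # #root_url is derived from the class configuration: the normalized
  # .base_url when no meaningful root path is set, otherwise .base_url
  # joined with .root_path. Both variants are memoized.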
describe "#root_url" do
let :root_url do
scraper.root_url
end
context "when #root_path? is false" do
before do
stub(scraper).root_path? { false }
end
it "returns a memoized Docs::URL" do
assert_instance_of Docs::URL, root_url
assert_same root_url, scraper.root_url
end
it "returns the normalized .base_url" do
stub(Scraper).base_url { 'http://example.com' }
assert_equal 'http://example.com/', root_url.to_s
end
end
context "when #root_path? is true" do
before do
stub(scraper).root_path? { true }
end
it "returns a memoized Docs::URL" do
assert_instance_of Docs::URL, root_url
assert_same root_url, scraper.root_url
end
it "returns .base_url + .root_path" do
stub(Scraper).base_url { 'http://example.com/path/' }
stub(Scraper).root_path { '/root' }
assert_equal 'http://example.com/path/root', root_url.to_s
end
end
end
describe "#initial_urls" do
let :initial_urls do
scraper.initial_urls
end
it "returns a frozen, memoized Array" do
assert_instance_of Array, initial_urls
assert initial_urls.frozen?
assert_same initial_urls, scraper.initial_urls
end
it "includes the #root_url" do
assert_includes initial_urls, scraper.root_url.to_s
end
it "includes the .initial_paths converted to urls" do
stub(Scraper).base_url { 'http://example.com/' }
stub(Scraper).initial_paths { ['one', '/two'] }
assert_includes initial_urls, 'http://example.com/one'
assert_includes initial_urls, 'http://example.com/two'
end
end
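
  # #build_page scrapes a single path: blank paths and '/' resolve to the
  # root url, other paths are joined onto .base_url, and the processed
  # response is both returned and yielded.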
describe "#build_page" do
before do
stub(scraper).handle_response
end
it "requires a path" do
assert_raises ArgumentError do
scraper.build_page
end
end
context "with a blank path" do
it "requests the root url" do
mock(scraper).request_one(scraper.root_url.to_s)
scraper.build_page ''
end
end
context "with '/'" do
it "requests the root url" do
mock(scraper).request_one(scraper.root_url.to_s)
scraper.build_page '/'
end
end
context "with '/file'" do
it "requests 'example.com/file' when the base url is 'example.com" do
stub(Scraper).base_url { 'http://example.com' }
mock(scraper).request_one 'http://example.com/file'
scraper.build_page '/file'
end
it "requests 'example.com/file' when the base url is 'example.com/" do
stub(Scraper).base_url { 'http://example.com/' }
mock(scraper).request_one 'http://example.com/file'
scraper.build_page '/file'
end
end
it "returns the processed response" do
stub(scraper).request_one { response }
mock(scraper).handle_response(response) { 'test' }
assert_equal 'test', scraper.build_page('')
end
it "yields the processed response" do
stub(scraper).request_one { response }
stub(scraper).handle_response(response) { 'test' }
scraper.build_page('') { |arg| @arg = arg }
assert @arg
assert_equal 'test', @arg
end
end
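
  # #build_pages drives the full crawl: it starts from #initial_urls,
  # follows the :internal_urls returned by each processed response,
  # skips urls it has already seen (case-insensitively), and instruments
  # 'running' and 'queued' events along the way.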
describe "#build_pages" do
let :block do
Proc.new {}
end
let :processed_response do
Hash.new
end
it "requests the #initial_urls" do
mock(scraper).request_all(scraper.initial_urls)
scraper.build_pages(&block)
end
it "instruments 'running'" do
stub(scraper).request_all
scraper.build_pages(&block)
assert scraper.last_instrumentation
assert_equal 'running.scraper', scraper.last_instrumentation[:event]
assert_equal scraper.initial_urls, scraper.last_instrumentation[:payload][:urls]
end
context "when the response is processable" do
before do
stub(scraper).request_all do |urls, &block|
urls.each { |url| @next_urls ||= block.call(response) }
end
stub(scraper).handle_response(response) { processed_response }
end
it "yields the processed response" do
scraper.build_pages { |arg| @arg = arg }
assert_same processed_response, @arg
end
context "when :internal_urls is empty" do
before do
processed_response[:internal_urls] = []
end
it "requests nothing more" do
scraper.build_pages(&block)
assert_nil @next_urls
end
it "doesn't instrument 'queued'" do
scraper.build_pages(&block)
refute_equal 'queued.scraper', scraper.last_instrumentation.try(:[], :event)
end
end
context "when :internal_urls isn't empty" do
let :internal_urls do
['Url']
end
before do
processed_response[:internal_urls] = internal_urls
end
it "requests the urls" do
scraper.build_pages(&block)
assert_equal internal_urls, @next_urls
end
it "doesn't request the same url twice irrespective of case" do
processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase)
scraper.build_pages(&block)
assert_empty @next_urls
end
it "instruments 'queued'" do
scraper.build_pages(&block)
assert scraper.last_instrumentation
assert_equal 'queued.scraper', scraper.last_instrumentation[:event]
assert_equal internal_urls, scraper.last_instrumentation[:payload][:urls]
end
end
end
context "when the response isn't processable" do
it "doesn't yield" do
stub(scraper).request_all.yields(response)
stub(scraper).handle_response(response) { nil }
scraper.build_pages { @yield = true }
refute @yield
end
end
end
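
  # #options exposes a frozen, memoized snapshot of the class-level
  # options merged with the instance's base_url, root_url, root_path, and
  # initial_paths, and adjusts the :only/:skip filters without mutating
  # the originals in place.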
describe "#options" do
let :options do
Hash.new
end
let :result do
scraper.options
end
before do
stub(Scraper).options { options }
end
it "returns a frozen, memoized Hash" do
assert_instance_of Hash, result
assert result.frozen?
assert_same result, scraper.options
end
it "includes .options" do
options[:test] = true
assert result[:test]
end
it "includes #base_url" do
assert_equal scraper.base_url, result[:base_url]
end
it "includes #root_url" do
assert_equal scraper.root_url, result[:root_url]
end
it "includes #root_path" do
assert_equal '/root', result[:root_path]
end
it "includes #initial_paths" do
assert_equal ['/initial'], result[:initial_paths]
end
it "adds #initial_paths to :only when it is an array" do
options[:only] = ['/path']
assert_includes result[:only], options[:only].first
assert_includes result[:only], '/initial'
end
it "adds #initial_paths to :only when :only_patterns is an array" do
options[:only_patterns] = []
assert_includes result[:only], '/initial'
end
it "doesn't modify :only in place" do
options[:only] = []
result
assert_empty options[:only]
end
context "when #root_path? is false" do
before do
stub(scraper).root_path? { false }
end
it "doesn't modify :skip" do
options[:skip] = []
assert_equal options[:skip], result[:skip]
end
it "adds '' and '/' to :only when it is an array" do
options[:only] = ['/path']
assert_includes result[:only], options[:only].first
assert_includes result[:only], ''
assert_includes result[:only], '/'
end
it "adds '' and '/' to :only when :only_patterns is an array" do
options[:only_patterns] = []
assert_includes result[:only], ''
assert_includes result[:only], '/'
end
it "doesn't modify :only in place" do
options[:only] = []
result
assert_empty options[:only]
end
end
context "when #root_path? is true" do
before do
stub(scraper).root_path? { true }
end
it "adds '' and '/' to :skip when it is nil" do
assert_includes result[:skip], ''
assert_includes result[:skip], '/'
end
it "adds '' and '/' to :skip when it is an array" do
options[:skip] = ['/path']
assert_includes result[:skip], options[:skip].first
assert_includes result[:skip], ''
assert_includes result[:skip], '/'
end
it "doesn't modify :skip in place" do
options[:skip] = []
result
assert_empty options[:skip]
end
it "adds #root_path to :only when it is an array" do
options[:only] = ['/path']
assert_includes result[:only], options[:only].first
assert_includes result[:only], '/root'
end
it "adds #root_path to :only when :only_patterns is an array" do
options[:only_patterns] = []
assert_includes result[:only], '/root'
end
end
end
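
  # #handle_response is the gatekeeper between fetching and filtering:
  # processable responses have their body parsed and run through the
  # HTML::Pipeline, unprocessable ones are ignored, and both outcomes
  # are instrumented.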
describe "#handle_response" do
let :result do
scraper.send :handle_response, response
end
context "when the response is processable" do
before do
stub(scraper).process_response?(response) { true }
end
it "runs the pipeline" do
mock(scraper.pipeline).call.with_any_args
result
end
it "returns the result" do
stub(scraper.pipeline).call { |_, _, result| result[:test] = true }
assert result[:test]
end
it "instruments 'process_response'" do
result
assert scraper.last_instrumentation
assert_equal 'process_response.scraper', scraper.last_instrumentation[:event]
assert_equal response, scraper.last_instrumentation[:payload][:response]
end
context "the pipeline document" do
it "is the parsed response body" do
response.body = 'body'
stub(scraper.pipeline).call { |arg| @arg = arg }
mock.proxy(Docs::Parser).new('body') { |parser| stub(parser).html { 'html' } }
result
assert_equal 'html', @arg
end
end
context "the pipeline context" do
let :context do
stub(scraper.pipeline).call { |_, arg| @arg = arg }
result
@arg
end
it "includes #options" do
stub(scraper).options { { test: true } }
assert context[:test]
end
it "includes the response url" do
response.url = 'url'
assert_equal 'url', context[:url]
end
end
end
context "when the response isn't processable" do
before do
stub(scraper).process_response?(response) { false }
end
it "doesn't run the pipeline" do
dont_allow(scraper.pipeline).call
result
end
it "returns nil" do
assert_nil result
end
it "instruments 'ignore_response'" do
result
assert scraper.last_instrumentation
assert_equal 'ignore_response.scraper', scraper.last_instrumentation[:event]
assert_equal response, scraper.last_instrumentation[:payload][:response]
end
end
end
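
  # The pipeline itself is a memoized HTML::Pipeline built from .filters
  # (html filters followed by text filters), with Docs acting as the
  # instrumentation service.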
describe "#pipeline" do
let :pipeline do
scraper.pipeline
end
it "returns a memoized HTML::Pipeline" do
assert_instance_of ::HTML::Pipeline, pipeline
assert_same pipeline, scraper.pipeline
end
it "returns a pipeline with the filters stored in .filters" do
stub(Scraper).filters { [1] }
assert_equal Scraper.filters, pipeline.filters
end
it "returns a pipeline with Docs as instrumentation service" do
assert_equal Docs, pipeline.instrumentation_service
end
end
end
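
The instrumentation assertions above rely on a FakeInstrumentation helper that is extended onto the scraper instance. Its definition lives in the test helper, not in this file; a minimal sketch of what such a module needs to provide, inferred from how the tests use it, might look like this:

# Hypothetical sketch (not the actual test_helper code): a recording
# stand-in for the scraper's instrumentation. It captures the most recent
# instrument call so tests can assert on the event name and payload.
module FakeInstrumentation
  attr_reader :last_instrumentation

  def instrument(event, payload = {})
    @last_instrumentation = { event: event, payload: payload }
    yield if block_given?
  end
end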