
Testing Documentation Scraper Implementation in DevDocs

This test suite validates the core scraping functionality of the DevDocs documentation scraper. It comprehensively tests the scraper’s handling of URL processing, page building, and content filtering, while verifying proper inheritance and configuration management.

Test Coverage Overview

The test suite provides extensive coverage of the Docs::Scraper class, focusing on its critical scraping operations.

Key areas tested include:
  • URL handling and path normalization
  • Page building and processing
  • Response handling and filtering
  • Configuration inheritance and management
  • Pipeline setup and execution
Edge cases covered include blank paths, root URL handling, and variations in URL format, as illustrated in the sketch below.
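
The following is a minimal, self-contained sketch of the kind of path normalization these edge-case tests exercise. It is not DevDocs code: PathNormalizer is a hypothetical class invented for illustration. The rules it models are the ones the suite asserts on, namely that blank paths and '/' both resolve to the root URL, and that base URL and path are joined by exactly one slash.

require 'minitest/autorun'

# Hypothetical class illustrating the normalization rules under test.
class PathNormalizer
  def initialize(base_url)
    @base_url = base_url.chomp('/')
  end

  # Blank paths and '/' resolve to the root URL; any other path is
  # joined to the base URL with a single slash, regardless of how
  # many leading or trailing slashes the inputs carry.
  def url_for(path)
    path = path.to_s.strip
    return "#{@base_url}/" if path.empty? || path == '/'
    "#{@base_url}/#{path.delete_prefix('/')}"
  end
end

class PathNormalizerTest < Minitest::Spec
  it "maps blank paths and '/' to the root url" do
    normalizer = PathNormalizer.new('http://example.com/')
    assert_equal 'http://example.com/', normalizer.url_for('')
    assert_equal 'http://example.com/', normalizer.url_for('/')
  end

  it "joins base url and path with exactly one slash" do
    normalizer = PathNormalizer.new('http://example.com')
    assert_equal 'http://example.com/file', normalizer.url_for('/file')
  end
end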

Implementation Analysis

The tests use RSpec-style syntax via Minitest::Spec and validate the scraper’s core components systematically. They follow a behavior-driven pattern and isolate functionality with mocks and stubs; the stub, mock, and dont_allow helpers seen in the listing are RR-style test doubles.
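
As a brief illustration of that style (the Greeter class and its methods are hypothetical, not part of DevDocs), a Minitest::Spec test with an RR stub looks roughly like this:

require 'minitest/autorun'
require 'rr' # supplies the stub/mock helpers used throughout the suite

# Hypothetical collaborator used only to demonstrate the pattern.
class Greeter
  def greeting
    'hello'
  end

  def greet
    greeting.upcase
  end
end

describe Greeter do
  let(:greeter) { Greeter.new }

  it 'builds the greeting from #greeting' do
    # Stubbing #greeting isolates #greet from its collaborator.
    stub(greeter).greeting { 'hi' }
    _(greeter.greet).must_equal 'HI'
  end
end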

Key patterns include:
  • Inheritance verification for scraper configuration
  • URL processing and normalization checks
  • Response handling pipeline validation
  • Options management and merging tests (see the sketch after this list)
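
The options-merging pattern deserves a closer look. Below is a simplified sketch of the behavior the suite verifies; MiniScraper is hypothetical, and the real logic lives in Docs::Scraper#options. Class-level options are merged with instance data into a frozen, memoized Hash, without mutating the class-level arrays in place:

# Hypothetical, simplified model of the behavior under test.
class MiniScraper
  def self.options
    @options ||= { only: ['/path'] }
  end

  def initial_paths
    ['/initial']
  end

  def options
    @options ||= begin
      opts = self.class.options.dup
      # Copy :only before appending so the class-level array is untouched.
      opts[:only] = Array(opts[:only]) + initial_paths
      opts.freeze
    end
  end
end

scraper = MiniScraper.new
scraper.options[:only]     # => ["/path", "/initial"]
MiniScraper.options[:only] # => ["/path"] (unmodified)
scraper.options.frozen?    # => true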

Technical Details

Testing tools and configuration:
  • Minitest::Spec as the testing framework
  • Mock/stub functionality (RR-style test doubles) for isolation
  • HTML::Pipeline integration tests
  • Custom instrumentation for monitoring (a plausible helper shape is sketched below)
  • Structured test helpers and shared contexts
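
The listing extends each scraper instance with FakeInstrumentation, a helper defined in the suite's test_helper, which is not shown here. A plausible minimal implementation, offered only as an assumption about its shape, records the most recent instrument call so tests can assert on the event name and payload:

# Assumed shape of the helper; the real one lives in test_helper.
module FakeInstrumentation
  attr_reader :last_instrumentation

  # Capture the event and payload instead of notifying real subscribers.
  def instrument(event, payload = {})
    @last_instrumentation = { event: event, payload: payload }
    yield if block_given?
  end
end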

Best Practices Demonstrated

The test suite exemplifies high-quality testing practices through comprehensive coverage and well-structured test organization. Notable practices include:
  • Isolated unit tests with proper mocking
  • Thorough edge case coverage
  • Clear test descriptions and organization
  • Effective use of setup and teardown
  • Consistent testing patterns across functionality

freecodecamp/devdocs

test/lib/docs/core/scraper_test.rb

require_relative '../../../test_helper'
require_relative '../../../../lib/docs'

class DocsScraperTest < Minitest::Spec
  class Scraper < Docs::Scraper
    self.type = 'scraper'
    self.base_url = 'http://example.com/'
    self.root_path = '/root'
    self.initial_paths = ['/initial']
    self.html_filters = Docs::FilterStack.new
    self.text_filters = Docs::FilterStack.new
  end

  let :scraper do
    Scraper.new.tap do |scraper|
      scraper.extend FakeInstrumentation
    end
  end

  let :response do
    Struct.new(:body, :url).new
  end

  describe ".inherited" do
    let :subclass do
      Class.new Scraper
    end

    it "sets .type" do
      assert_equal Scraper.type, subclass.type
    end

    it "sets .root_path" do
      assert_equal Scraper.root_path, subclass.root_path
    end

    it "duplicates .initial_paths" do
      stub(Scraper).initial_paths { ['path'] }
      assert_equal Scraper.initial_paths, subclass.initial_paths
      refute_same Scraper.initial_paths, subclass.initial_paths
    end

    it "duplicates .options" do
      stub(Scraper).options { { test: [] } }
      assert_equal Scraper.options, subclass.options
      refute_same Scraper.options, subclass.options
      refute_same Scraper.options[:test], subclass.options[:test]
    end

    it "duplicates .html_filters" do
      assert_equal Scraper.html_filters, subclass.html_filters
      refute_same Scraper.html_filters, subclass.html_filters
    end

    it "duplicates .text_filters" do
      assert_equal Scraper.text_filters, subclass.text_filters
      refute_same Scraper.text_filters, subclass.text_filters
    end
  end

  describe ".filters" do
    it "returns the union of .html_filters and .text_filters" do
      stub(Scraper.html_filters).to_a { [1] }
      stub(Scraper.text_filters).to_a { [2] }
      assert_equal [1, 2], Scraper.filters
    end
  end

  describe "#root_path?" do
    it "returns false when .root_path is blank" do
      stub(Scraper).root_path { '' }
      refute scraper.root_path?
    end

    it "returns false when .root_path is '/'" do
      stub(Scraper).root_path { '/' }
      refute scraper.root_path?
    end

    it "returns true when .root_path is '/path'" do
      stub(Scraper).root_path { '/path' }
      assert scraper.root_path?
    end
  end

  describe "#root_url" do
    let :root_url do
      scraper.root_url
    end

    context "when #root_path? is false" do
      before do
        stub(scraper).root_path? { false }
      end

      it "returns a memoized Docs::URL" do
        assert_instance_of Docs::URL, root_url
        assert_same root_url, scraper.root_url
      end

      it "returns the normalized .base_url" do
        stub(Scraper).base_url { 'http://example.com' }
        assert_equal 'http://example.com/', root_url.to_s
      end
    end

    context "when #root_path? is true" do
      before do
        stub(scraper).root_path? { true }
      end

      it "returns a memoized Docs::URL" do
        assert_instance_of Docs::URL, root_url
        assert_same root_url, scraper.root_url
      end

      it "returns .base_url + .root_path" do
        stub(Scraper).base_url { 'http://example.com/path/' }
        stub(Scraper).root_path { '/root' }
        assert_equal 'http://example.com/path/root', root_url.to_s
      end
    end
  end

  describe "#initial_urls" do
    let :initial_urls do
      scraper.initial_urls
    end

    it "returns a frozen, memoized Array" do
      assert_instance_of Array, initial_urls
      assert initial_urls.frozen?
      assert_same initial_urls, scraper.initial_urls
    end

    it "includes the #root_url" do
      assert_includes initial_urls, scraper.root_url.to_s
    end

    it "includes the .initial_paths converted to urls" do
      stub(Scraper).base_url { 'http://example.com/' }
      stub(Scraper).initial_paths { ['one', '/two'] }
      assert_includes initial_urls, 'http://example.com/one'
      assert_includes initial_urls, 'http://example.com/two'
    end
  end

  describe "#build_page" do
    before do
      stub(scraper).handle_response
    end

    it "requires a path" do
      assert_raises ArgumentError do
        scraper.build_page
      end
    end

    context "with a blank path" do
      it "requests the root url" do
        mock(scraper).request_one(scraper.root_url.to_s)
        scraper.build_page ''
      end
    end

    context "with '/'" do
      it "requests the root url" do
        mock(scraper).request_one(scraper.root_url.to_s)
        scraper.build_page '/'
      end
    end

    context "with '/file'" do
      it "requests 'example.com/file' when the base url is 'example.com" do
        stub(Scraper).base_url { 'http://example.com' }
        mock(scraper).request_one 'http://example.com/file'
        scraper.build_page '/file'
      end

      it "requests 'example.com/file' when the base url is 'example.com/" do
        stub(Scraper).base_url { 'http://example.com/' }
        mock(scraper).request_one 'http://example.com/file'
        scraper.build_page '/file'
      end
    end

    it "returns the processed response" do
      stub(scraper).request_one { response }
      mock(scraper).handle_response(response) { 'test' }
      assert_equal 'test', scraper.build_page('')
    end

    it "yields the processed response" do
      stub(scraper).request_one { response }
      stub(scraper).handle_response(response) { 'test' }
      scraper.build_page('') { |arg| @arg = arg }
      assert @arg
      assert_equal 'test', @arg
    end
  end

  describe "#build_pages" do
    let :block do
      Proc.new {}
    end

    let :processed_response do
      Hash.new
    end

    it "requests the #initial_urls" do
      mock(scraper).request_all(scraper.initial_urls)
      scraper.build_pages(&block)
    end

    it "instruments 'running'" do
      stub(scraper).request_all
      scraper.build_pages(&block)
      assert scraper.last_instrumentation
      assert_equal 'running.scraper', scraper.last_instrumentation[:event]
      assert_equal scraper.initial_urls, scraper.last_instrumentation[:payload][:urls]
    end

    context "when the response is processable" do
      before do
        stub(scraper).request_all do |urls, &block|
          urls.each { |url| @next_urls ||= block.call(response) }
        end
        stub(scraper).handle_response(response) { processed_response }
      end

      it "yields the processed response" do
        scraper.build_pages { |arg| @arg = arg }
        assert_same processed_response, @arg
      end

      context "when :internal_urls is empty" do
        before do
          processed_response[:internal_urls] = []
        end

        it "requests nothing more" do
          scraper.build_pages(&block)
          assert_nil @next_urls
        end

        it "doesn't instrument 'queued'" do
          scraper.build_pages(&block)
          refute_equal 'queued.scraper', scraper.last_instrumentation.try(:[], :event)
        end
      end

      context "when :internal_urls isn't empty" do
        let :internal_urls do
          ['Url']
        end

        before do
          processed_response[:internal_urls] = internal_urls
        end

        it "requests the urls" do
          scraper.build_pages(&block)
          assert_equal internal_urls, @next_urls
        end

        it "doesn't request the same url twice irrespective of case" do
          processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase)
          scraper.build_pages(&block)
          assert_empty @next_urls
        end

        it "instruments 'queued'" do
          scraper.build_pages(&block)
          assert scraper.last_instrumentation
          assert_equal 'queued.scraper', scraper.last_instrumentation[:event]
          assert_equal internal_urls, scraper.last_instrumentation[:payload][:urls]
        end
      end
    end

    context "when the response isn't processable" do
      it "doesn't yield" do
        stub(scraper).request_all.yields(response)
        stub(scraper).handle_response(response) { nil }
        scraper.build_pages { @yield = true }
        refute @yield
      end
    end
  end

  describe "#options" do
    let :options do
      Hash.new
    end

    let :result do
      scraper.options
    end

    before do
      stub(Scraper).options { options }
    end

    it "returns a frozen, memoized Hash" do
      assert_instance_of Hash, result
      assert result.frozen?
      assert_same result, scraper.options
    end

    it "includes .options" do
      options[:test] = true
      assert result[:test]
    end

    it "includes #base_url" do
      assert_equal scraper.base_url, result[:base_url]
    end

    it "includes #root_url" do
      assert_equal scraper.root_url, result[:root_url]
    end

    it "includes #root_path" do
      assert_equal '/root', result[:root_path]
    end

    it "includes #initial_paths" do
      assert_equal ['/initial'], result[:initial_paths]
    end

    it "adds #initial_paths to :only when it is an array" do
      options[:only] = ['/path']
      assert_includes result[:only], options[:only].first
      assert_includes result[:only], '/initial'
    end

    it "adds #initial_paths to :only when :only_patterns is an array" do
      options[:only_patterns] = []
      assert_includes result[:only], '/initial'
    end

    it "doesn't modify :only in place" do
      options[:only] = []
      result
      assert_empty options[:only]
    end

    context "when #root_path? is false" do
      before do
        stub(scraper).root_path? { false }
      end

      it "doesn't modify :skip" do
        options[:skip] = []
        assert_equal options[:skip], result[:skip]
      end

      it "adds '' and '/' to :only when it is an array" do
        options[:only] = ['/path']
        assert_includes result[:only], options[:only].first
        assert_includes result[:only], ''
        assert_includes result[:only], '/'
      end

      it "adds '' and '/' to :only when :only_patterns is an array" do
        options[:only_patterns] = []
        assert_includes result[:only], ''
        assert_includes result[:only], '/'
      end

      it "doesn't modify :only in place" do
        options[:only] = []
        result
        assert_empty options[:only]
      end
    end

    context "when #root_path? is true" do
      before do
        stub(scraper).root_path? { true }
      end

      it "adds '' and '/' to :skip when it is nil" do
        assert_includes result[:skip], ''
        assert_includes result[:skip], '/'
      end

      it "adds '' and '/' to :skip when it is an array" do
        options[:skip] = ['/path']
        assert_includes result[:skip], options[:skip].first
        assert_includes result[:skip], ''
        assert_includes result[:skip], '/'
      end

      it "doesn't modify :skip in place" do
        options[:skip] = []
        result
        assert_empty options[:skip]
      end

      it "adds #root_path to :only when it is an array" do
        options[:only] = ['/path']
        assert_includes result[:only], options[:only].first
        assert_includes result[:only], '/root'
      end

      it "adds #root_path to :only when :only_patterns is an array" do
        options[:only_patterns] = []
        assert_includes result[:only], '/root'
      end
    end
  end

  describe "#handle_response" do
    let :result do
      scraper.send :handle_response, response
    end

    context "when the response is processable" do
      before do
        stub(scraper).process_response?(response) { true }
      end

      it "runs the pipeline" do
        mock(scraper.pipeline).call.with_any_args
        result
      end

      it "returns the result" do
        stub(scraper.pipeline).call { |_, _, result| result[:test] = true }
        assert result[:test]
      end

      it "instruments 'process_response'" do
        result
        assert scraper.last_instrumentation
        assert_equal 'process_response.scraper', scraper.last_instrumentation[:event]
        assert_equal response, scraper.last_instrumentation[:payload][:response]
      end

      context "the pipeline document" do
        it "is the parsed response body" do
          response.body = 'body'
          stub(scraper.pipeline).call { |arg| @arg = arg }
          mock.proxy(Docs::Parser).new('body') { |parser| stub(parser).html { 'html' } }
          result
          assert_equal 'html', @arg
        end
      end

      context "the pipeline context" do
        let :context do
          stub(scraper.pipeline).call { |_, arg| @arg = arg }
          result
          @arg
        end

        it "includes #options" do
          stub(scraper).options { { test: true } }
          assert context[:test]
        end

        it "includes the response url" do
          response.url = 'url'
          assert_equal 'url', context[:url]
        end
      end
    end

    context "when the response isn't processable" do
      before do
        stub(scraper).process_response?(response) { false }
      end

      it "doesn't run the pipeline" do
        dont_allow(scraper.pipeline).call
        result
      end

      it "returns nil" do
        assert_nil result
      end

      it "instruments 'ignore_response'" do
        result
        assert scraper.last_instrumentation
        assert_equal 'ignore_response.scraper', scraper.last_instrumentation[:event]
        assert_equal response, scraper.last_instrumentation[:payload][:response]
      end
    end
  end

  describe "#pipeline" do
    let :pipeline do
      scraper.pipeline
    end

    it "returns a memoized HTML::Pipeline" do
      assert_instance_of ::HTML::Pipeline, pipeline
      assert_same pipeline, scraper.pipeline
    end

    it "returns a pipeline with the filters stored in .filters" do
      stub(Scraper).filters { [1] }
      assert_equal Scraper.filters, pipeline.filters
    end

    it "returns a pipeline with Docs as instrumentation service" do
      assert_equal Docs, pipeline.instrumentation_service
    end
  end
end