Back to Repositories

Testing Linguist Repository Language Detection and Git Attributes in github-linguist

This test suite validates core functionality of the Linguist repository class, focusing on language detection, file analysis, and Git attribute handling. It includes comprehensive tests for repository statistics, language identification, and file classification features.

Test Coverage Overview

The test suite provides extensive coverage of the Linguist repository’s core functions including:
  • Language detection and statistics calculation
  • File breakdown analysis and categorization
  • Git attributes processing and overrides
  • Incremental repository updates
  • Empty repository edge cases

Implementation Analysis

The testing approach uses Ruby’s Minitest framework with two test classes: TestRuggedRepository, which exercises a real Git repository through Rugged, and TestEmptyRepository, which covers the empty-repository edge case. TestEmptyRepository relies on a hand-written stub class, EmptyRepository, rather than a mocking library, while TestRuggedRepository pins specific Git commit hashes for version-specific testing.

Technical Details

Key technical components include:
  • Minitest as the testing framework
  • Rugged for Git repository interaction
  • Custom fixture repositories and commit hashes
  • GitAttributes integration testing
  • UTF-8 encoding validation

Best Practices Demonstrated

The test suite exemplifies several testing best practices:
  • Isolated test cases that build their fixtures through small helper methods rather than setup/teardown hooks
  • Comprehensive edge case handling
  • Version-specific test scenarios
  • Proper test organization and naming
  • Effective use of assertions and refutations

github-linguist/linguist

test/test_repository.rb

            
require_relative "./helper"

# Integration tests for Linguist::Repository backed by a Rugged repository.
# Each test pins explicit commit OIDs and checks language statistics, the
# per-file breakdown, or the effect of .gitattributes overrides.
class TestRuggedRepository < Minitest::Test
  # OID of the commit analysed when a test does not name another one.
  def master_oid
    '7dbcffcf982e766fc711e633322de848f2b60ba5'
  end

  # Builds a new Linguist::Repository for +oid+ (defaults to master_oid).
  # Nothing is memoized here: every call returns a fresh instance.
  def linguist_repo(oid = master_oid)
    Linguist::Repository.new(source_repository, oid)
  end

  # Memoized Rugged handle on the ".git" directory located in the parent of
  # this file's directory.
  def source_repository
    @rugged ||= Rugged::Repository.new(File.expand_path("../../.git", __FILE__))
  end

  # The primary language at master_oid is Ruby.
  def test_linguist_language
    assert_equal 'Ruby', linguist_repo.language
  end

  # The Ruby entry of the language statistics exceeds 10_000.
  def test_linguist_languages
    assert_operator linguist_repo.languages['Ruby'], :>, 10_000
  end

  # The overall size exceeds 30_000.
  def test_linguist_size
    assert_operator linguist_repo.size, :>, 30_000
  end

  # The per-file breakdown lists known Ruby files under the "Ruby" key.
  def test_linguist_breakdown
    # Compute the breakdown once instead of rebuilding the repository for
    # every assertion.
    breakdown = linguist_repo.breakdown_by_file

    assert breakdown.key?("Ruby")
    assert_includes breakdown["Ruby"], "bin/github-linguist"
    assert_includes breakdown["Ruby"], "lib/linguist/language.rb"
  end

  # Incremental analysis seeded with an older commit's cache must end up with
  # the same cache as a from-scratch analysis of master_oid.
  def test_incremental_stats
    old_commit = '3d7364877d6794f6cc2a86b493e893968a597332'
    old_repo = linguist_repo(old_commit)

    assert_operator old_repo.languages['Ruby'], :>, 10_000
    assert_operator old_repo.size, :>, 30_000

    new_repo = Linguist::Repository.incremental(source_repository, master_oid, old_commit, old_repo.cache)

    assert_operator new_repo.languages['Ruby'], :>, old_repo.languages['Ruby']
    assert_operator new_repo.size, :>, old_repo.size

    assert_equal linguist_repo.cache, new_repo.cache
  end

  # Language overrides declared in .gitattributes show up in the breakdown.
  def test_repo_git_attributes
    # See https://github.com/github/linguist/blob/72a89fc9dcd3585250056ab591f9d7e2411d5fa1/.gitattributes
    #
    # It looks like this:
    # Gemfile linguist-vendored=true
    # lib/linguist.rb linguist-language=Java
    # test/*.rb linguist-language=Java
    # Rakefile linguist-generated
    # test/fixtures/** linguist-vendored=false
    # README.md linguist-documentation=false
    # samples/Arduino/* linguist-documentation
    # samples/Markdown/*.md linguist-detectable=true
    # samples/HTML/*.html linguist-detectable=false
    # samples/CSS/bootstrap.css -linguist-vendored
    # samples/CSS/bootstrap.min.css -linguist-generated
    # LICENSE -linguist-documentation
    # samples/CoffeeScript/browser.coffee -linguist-detectable

    attr_commit = '72a89fc9dcd3585250056ab591f9d7e2411d5fa1'
    repo = linguist_repo(attr_commit)

    assert repo.breakdown_by_file.key?("Java")
    assert_includes repo.breakdown_by_file["Java"], "lib/linguist.rb"

    assert repo.breakdown_by_file.key?("Ruby")
    refute_empty repo.breakdown_by_file["Ruby"]

    # Ensures the filename that contains unicode char is UTF-8 encoded and invalid chars scrubbed
    # NOTE(review): the U+FFFD in the expected path below appears to be the
    # scrubbed replacement for an invalid byte; it is intentional, keep it.
    assert repo.breakdown_by_file.key?("Raku")
    raku_files = repo.breakdown_by_file["Raku"]
    assert_includes raku_files, "test/fixtures/ba�r/file_ã.pl"
    assert_equal "UTF-8", raku_files.first.encoding.to_s
    assert_predicate raku_files.first, :valid_encoding?
  end

  # An incremental run that crosses the commit introducing .gitattributes data
  # must still apply the language override.
  def test_commit_with_git_attributes_data
    # Before we had any .gitattributes data
    old_commit = '4a017d9033f91b2776eb85275463f9613cc371ef'
    old_repo = linguist_repo(old_commit)

    # With some .gitattributes data
    attr_commit = '7ee006cbcb2d7261f9e648510a684ee9ac64126b'
    # It's incremental but now is scanning more data and should bust the cache
    # NOTE(review): 350_000 is presumably a tree-size limit; confirm against
    # Linguist::Repository.incremental.
    new_repo = Linguist::Repository.incremental(source_repository, attr_commit, old_commit, old_repo.cache, 350_000)

    assert_includes new_repo.breakdown_by_file["Java"], "lib/linguist.rb"
  end

  # linguist-vendored=true marks a file as vendored.
  def test_linguist_override_vendored?
    attr_commit = '72a89fc9dcd3585250056ab591f9d7e2411d5fa1'
    linguist_repo(attr_commit).repository.set_attribute_source(attr_commit)

    override_vendored = Linguist::LazyBlob.new(source_repository, attr_commit, 'Gemfile')

    # overridden .gitattributes
    assert_predicate override_vendored, :vendored?
  end

  # linguist-vendored=false and -linguist-vendored both clear the vendored flag.
  def test_linguist_override_unvendored?
    attr_commit = '01d6b9c637a7a6581fe456c600725b68f355b295'
    linguist_repo(attr_commit).repository.set_attribute_source(attr_commit)

    # lib/linguist/vendor.yml defines this as vendored.
    override_unvendored = Linguist::LazyBlob.new(source_repository, attr_commit, 'test/fixtures/foo.rb')
    # test -linguist-vendored attribute method
    override_unvendored_minus = Linguist::LazyBlob.new(source_repository, attr_commit, 'samples/CSS/bootstrap.css')

    # overridden .gitattributes
    refute_predicate override_unvendored, :vendored?
    refute_predicate override_unvendored_minus, :vendored?
  end

  # linguist-documentation can be switched on, off, or unset per path.
  def test_linguist_override_documentation?
    attr_commit = "01d6b9c637a7a6581fe456c600725b68f355b295"
    linguist_repo(attr_commit).repository.set_attribute_source(attr_commit)

    readme = Linguist::LazyBlob.new(source_repository, attr_commit, "README.md")
    arduino = Linguist::LazyBlob.new(source_repository, attr_commit, "samples/Arduino/hello.ino")
    # test -linguist-documentation attribute method
    minus = Linguist::LazyBlob.new(source_repository, attr_commit, "LICENSE")

    # overridden by .gitattributes
    refute_predicate readme, :documentation?
    assert_predicate arduino, :documentation?
    refute_predicate minus, :documentation?
  end

  # linguist-generated marks a file as generated; -linguist-generated unsets it.
  def test_linguist_override_generated?
    attr_commit = "01d6b9c637a7a6581fe456c600725b68f355b295"
    linguist_repo(attr_commit).repository.set_attribute_source(attr_commit)

    rakefile = Linguist::LazyBlob.new(source_repository, attr_commit, "Rakefile")
    # test -linguist-generated attribute method
    minus = Linguist::LazyBlob.new(source_repository, attr_commit, "samples/CSS/bootstrap.min.css")

    # overridden .gitattributes
    assert_predicate rakefile, :generated?
    refute_predicate minus, :generated?
  end

  # linguist-detectable can be switched on, off, or unset per path.
  def test_linguist_override_detectable?
    attr_commit = "01d6b9c637a7a6581fe456c600725b68f355b295"
    linguist_repo(attr_commit).repository.set_attribute_source(attr_commit)

    # markdown is overridden by .gitattributes to be detectable, html to not be detectable
    markdown = Linguist::LazyBlob.new(source_repository, attr_commit, "samples/Markdown/tender.md")
    html = Linguist::LazyBlob.new(source_repository, attr_commit, "samples/HTML/pages.html")
    # test -linguist-detectable attribute method
    minus = Linguist::LazyBlob.new(source_repository, attr_commit, "samples/CoffeeScript/browser.coffee")

    assert_predicate markdown, :detectable?
    refute_predicate html, :detectable?
    refute_predicate minus, :detectable?
  end

  # After read_index, writing the Rugged index as a tree yields the expected
  # tree OID for the attributes commit.
  def test_read_index
    attr_commit = '72a89fc9dcd3585250056ab591f9d7e2411d5fa1'
    repo = linguist_repo(attr_commit)
    repo.read_index

    expected_tree = '9dd86972f2d3caa295588b329f9f195bcb409204'
    # Go through the accessor rather than the @rugged ivar so this test does
    # not depend on the memo having been populated as a side effect.
    assert_equal expected_tree, source_repository.index.write_tree
  end

  # current_tree at master_oid has the expected tree OID.
  def test_current_tree
    repo = linguist_repo

    expected_tree = 'f6cb65aeaee0b206b961746175ecaf4449f73c56'
    assert_equal expected_tree, repo.current_tree.oid
  end
end

################################################################################

# Edge-case tests: Linguist::Repository driven by the EmptyRepository stub
# instead of a real Git repository.
class TestEmptyRepository < Minitest::Test
  # Memoized stub source repository shared by the helpers of one test instance.
  def source_repository
    @source ||= EmptyRepository.new
  end

  # Builds a Linguist::Repository on top of the stub, using a 40-character
  # placeholder commit id.
  def linguist_repo
    placeholder_oid = "1234567890123456789012345678901234567890"
    Linguist::Repository.new(source_repository, placeholder_oid)
  end

  # No language is reported for the stub repository.
  def test_linguist_language
    detected = linguist_repo.language

    assert_nil detected
  end

  # The reported size for the stub repository is zero.
  def test_linguist_size
    total = linguist_repo.size

    assert_equal 0, total
  end

  # read_index is expected to raise NotImplementedError with this stub.
  # NOTE(review): EmptyRepository defines no index support; presumably the
  # error originates in its base class. Confirm.
  def test_read_index_raises_error
    assert_raises(NotImplementedError) do
      linguist_repo.read_index
    end
  end

  # current_tree is expected to raise NotImplementedError with this stub.
  def test_current_tree_raises_error
    assert_raises(NotImplementedError) do
      linguist_repo.current_tree
    end
  end
end

# Stub source repository with no content: every query answers with an empty
# or zero value, and all arguments are ignored.
class EmptyRepository < Linguist::Source::Repository
  # Diff that contains no deltas.
  class Diff < Linguist::Source::Diff
    # Iterates over an empty list, so +block+ is never invoked.
    def each_delta(&block)
      no_deltas = []
      no_deltas.each(&block)
    end
  end

  # Always returns a new, empty Diff, whatever the two commits are.
  def diff(old_commit, new_commit)
    EmptyRepository::Diff.new
  end

  # Always returns an empty string paired with 0.
  # NOTE(review): presumably [content, size]; confirm against the base class.
  def load_blob(blob_id, max_size)
    ["", 0]
  end

  # Always returns an empty Hash: no attributes are set for any path.
  def load_attributes_for_path(path, attr_names)
    Hash.new
  end

  # Always reports a tree size of 0.
  def get_tree_size(commit_id, limit)
    0
  end

  # Deliberate no-op; returns nil.
  def set_attribute_source(commit_id)
    nil
  end
end