Back to Repositories

Testing Language Detection Implementation in GitHub Linguist

This test suite validates language identification and classification functionality in the GitHub Linguist repository. It covers language alias resolution, language ID verification, and language property testing across the supported programming languages.

Test Coverage Overview

The test suite provides comprehensive coverage of language detection and classification features.

Key areas tested include:
  • Language alias resolution and mapping
  • Language ID verification and uniqueness
  • File extension and filename-based language detection
  • Language properties like type, color, and syntax modes
  • Grammar scope validation

Implementation Analysis

The testing approach uses Ruby’s Minitest framework with assertion-based validation. Tests follow a systematic pattern of verifying language attributes and relationships using the Language class API. Key patterns include direct attribute testing, collection validation, and edge case handling for language identification.

Technical Details

Testing tools and configuration:
  • Test Framework: Minitest
  • Test Helper: Custom helper module
  • Data Sources: languages.yml, grammars.yml
  • External Dependencies: CodeMirror, Ace editor modes
  • Test Environment: Ruby runtime

Best Practices Demonstrated

The test suite demonstrates strong testing practices through comprehensive validation of language attributes and relationships. Notable practices include:
  • Systematic verification of all language properties
  • Validation of external dependencies and integrations
  • Thorough edge case testing
  • Clear test organization and naming

github-linguist/linguist

test/test_language.rb

            
require_relative "./helper"

class TestLanguage < Minitest::Test
  include Linguist

  def test_find_by_alias
    assert_equal Language['ASP.NET'], Language.find_by_alias('aspx')
    assert_equal Language['ASP.NET'], Language.find_by_alias('aspx-vb')
    assert_equal Language['ActionScript'], Language.find_by_alias('as3')
    assert_equal Language['ApacheConf'], Language.find_by_alias('apache')
    assert_equal Language['Assembly'], Language.find_by_alias('nasm')
    assert_equal Language['Batchfile'], Language.find_by_alias('bat')
    assert_equal Language['C#'], Language.find_by_alias('c#')
    assert_equal Language['C#'], Language.find_by_alias('csharp')
    assert_equal Language['C'], Language.find_by_alias('c')
    assert_equal Language['C++'], Language.find_by_alias('c++')
    assert_equal Language['C++'], Language.find_by_alias('cpp')
    assert_equal Language['Chapel'], Language.find_by_alias('chpl')
    assert_equal Language['Classic ASP'], Language.find_by_alias('asp')
    assert_equal Language['CoffeeScript'], Language.find_by_alias('coffee')
    assert_equal Language['CoffeeScript'], Language.find_by_alias('coffee-script')
    assert_equal Language['ColdFusion'], Language.find_by_alias('cfm')
    assert_equal Language['Common Lisp'], Language.find_by_alias('common-lisp')
    assert_equal Language['Common Lisp'], Language.find_by_alias('lisp')
    assert_equal Language['Darcs Patch'], Language.find_by_alias('dpatch')
    assert_equal Language['Dart'], Language.find_by_alias('dart')
    assert_equal Language['EdgeQL'], Language.find_by_alias('esdl')
    assert_equal Language['Emacs Lisp'], Language.find_by_alias('elisp')
    assert_equal Language['Emacs Lisp'], Language.find_by_alias('emacs')
    assert_equal Language['Emacs Lisp'], Language.find_by_alias('emacs-lisp')
    assert_equal Language['Gettext Catalog'], Language.find_by_alias('pot')
    assert_equal Language['HTML'], Language.find_by_alias('html')
    assert_equal Language['HTML'], Language.find_by_alias('xhtml')
    assert_equal Language['HTML+ERB'], Language.find_by_alias('html+erb')
    assert_equal Language['HTML+ERB'], Language.find_by_alias('erb')
    assert_equal Language['IRC log'], Language.find_by_alias('irc')
    assert_equal Language['JSON'], Language.find_by_alias('json')
    assert_equal Language['Java Server Pages'], Language.find_by_alias('jsp')
    assert_equal Language['Java'], Language.find_by_alias('java')
    assert_equal Language['JavaScript'], Language.find_by_alias('javascript')
    assert_equal Language['JavaScript'], Language.find_by_alias('js')
    assert_equal Language['Literate Haskell'], Language.find_by_alias('lhs')
    assert_equal Language['Literate Haskell'], Language.find_by_alias('literate-haskell')
    assert_equal Language['Objective-C'], Language.find_by_alias('objc')
    assert_equal Language['OpenEdge ABL'], Language.find_by_alias('openedge')
    assert_equal Language['OpenEdge ABL'], Language.find_by_alias('progress')
    assert_equal Language['OpenEdge ABL'], Language.find_by_alias('abl')
    assert_equal Language['Parrot Internal Representation'], Language.find_by_alias('pir')
    assert_equal Language['PowerShell'], Language.find_by_alias('posh')
    assert_equal Language['Puppet'], Language.find_by_alias('puppet')
    assert_equal Language['Pure Data'], Language.find_by_alias('pure-data')
    assert_equal Language['Raw token data'], Language.find_by_alias('raw')
    assert_equal Language['Ruby'], Language.find_by_alias('rb')
    assert_equal Language['Ruby'], Language.find_by_alias('ruby')
    assert_equal Language['R'], Language.find_by_alias('r')
    assert_equal Language['Scheme'], Language.find_by_alias('scheme')
    assert_equal Language['Shell'], Language.find_by_alias('bash')
    assert_equal Language['Shell'], Language.find_by_alias('sh')
    assert_equal Language['Shell'], Language.find_by_alias('shell')
    assert_equal Language['Shell'], Language.find_by_alias('zsh')
    assert_equal Language['SuperCollider'], Language.find_by_alias('supercollider')
    assert_equal Language['TeX'], Language.find_by_alias('tex')
    assert_equal Language['TypeScript'], Language.find_by_alias('ts')
    assert_equal Language['Vim Script'], Language.find_by_alias('vim')
    assert_equal Language['Vim Script'], Language.find_by_alias('viml')
    assert_equal Language['reStructuredText'], Language.find_by_alias('rst')
    assert_equal Language['X BitMap'], Language.find_by_alias('xbm')
    assert_equal Language['X PixMap'], Language.find_by_alias('xpm')
    assert_equal Language['YAML'], Language.find_by_alias('yml')
    assert_nil Language.find_by_alias(nil)
  end

  # Note these are set by `script/update-ids`. If these tests fail then someone
  # has changed the `language_id` fields set in languages.yml which is almost certainly
  # not what you want to happen (these fields are used in GitHub's search indexes)
  def test_language_ids
    assert_equal 4, Language['ANTLR'].language_id
    assert_equal 54, Language['Ceylon'].language_id
    assert_equal 326, Language['Ruby'].language_id
    assert_equal 421, Language['xBase'].language_id
  end

  def test_find_by_id
    assert_equal Language['Elixir'], Language.find_by_id(100)
    assert_equal Language['Ruby'], Language.find_by_id(326)
    assert_equal Language['xBase'], Language.find_by_id(421)
  end

  def test_groups
    # Test a couple identity cases
    assert_equal Language['Perl'], Language['Perl'].group
    assert_equal Language['Python'], Language['Python'].group
    assert_equal Language['Ruby'], Language['Ruby'].group

    # Test a few special groups
    assert_equal Language['Assembly'], Language['Unix Assembly'].group
    assert_equal Language['C'], Language['OpenCL'].group
    assert_equal Language['Haskell'], Language['Literate Haskell'].group
    assert_equal Language['Java'], Language['Java Server Pages'].group
    assert_equal Language['Python'], Language['NumPy'].group
    assert_equal Language['Shell'], Language['Gentoo Ebuild'].group
    assert_equal Language['Shell'], Language['Gentoo Eclass'].group
    assert_equal Language['Shell'], Language['Tcsh'].group

    # Ensure everyone has a group
    Language.all.each do |language|
      assert language.group, "#{language} has no group"
    end
  end

  def test_popular
    assert Language['Ruby'].popular?
    assert Language['Perl'].popular?
    assert Language['Python'].popular?
    assert Language['Assembly'].unpopular?
    assert Language['Brainfuck'].unpopular?
  end

  def test_programming
    assert_equal :programming, Language['JavaScript'].type
    assert_equal :programming, Language['LSL'].type
    assert_equal :programming, Language['Perl'].type
    assert_equal :programming, Language['PowerShell'].type
    assert_equal :programming, Language['Python'].type
    assert_equal :programming, Language['Ruby'].type
    assert_equal :programming, Language['TypeScript'].type
    assert_equal :programming, Language['Makefile'].type
    assert_equal :programming, Language['SuperCollider'].type
  end

  def test_markup
    assert_equal :markup, Language['HTML'].type
    assert_equal :markup, Language['SCSS'].type
  end

  def test_data
    assert_equal :data, Language['YAML'].type
  end

  def test_prose
    assert_equal :prose, Language['Markdown'].type
    assert_equal :prose, Language['Org'].type
  end

  def test_find_by_name
    assert_nil Language.find_by_name(nil)
    ruby = Language['Ruby']
    assert_equal ruby, Language.find_by_name('Ruby')
  end

  def test_find_all_by_name
    Language.all.each do |language|
      assert_equal language, Language.find_by_name(language.name)
      assert_equal language, Language[language.name]
    end
  end

  def test_find_all_by_alias
    Language.all.each do |language|
      language.aliases.each do |name|
        assert_equal language, Language.find_by_alias(name)
        assert_equal language, Language[name]
      end
    end
  end

  def test_find_by_extension
    assert_equal [], Language.find_by_extension('.factor-rc')
    assert_equal [Language['Limbo'], Language['M'], Language['MATLAB'], Language['MUF'], Language['Mathematica'], Language['Mercury'], Language['Objective-C']], Language.find_by_extension('foo.m')
    assert_equal [Language['Ruby']], Language.find_by_extension('foo.rb')
    assert_equal [Language['Ruby']], Language.find_by_extension('foo/bar.rb')
    assert_equal [Language['Ruby']], Language.find_by_extension('PKGBUILD.rb')
    assert_equal ['C', 'C++', 'Objective-C'], Language.find_by_extension('foo.h').map(&:name).sort
    assert_equal [], Language.find_by_extension('rb')
    assert_equal [], Language.find_by_extension('.null')
    assert_equal [Language['Jinja']], Language.find_by_extension('index.jinja')
    assert_equal [Language['Chapel']], Language.find_by_extension('examples/hello.chpl')
    assert_equal [], Language.find_by_filename('F.I.L.E.')
  end

  def test_find_all_by_extension
    Language.all.each do |language|
      language.extensions.each do |extension|
        assert_includes Language.find_by_extension(extension), language
      end
    end
  end

  def test_find_by_filename
    assert_equal [Language['Shell']], Language.find_by_filename('PKGBUILD')
    assert_equal [Language['Ruby']], Language.find_by_filename('Rakefile')
    assert_equal Language['ApacheConf'], Language.find_by_filename('httpd.conf').first
    assert_equal [Language['ApacheConf']], Language.find_by_filename('.htaccess')
    assert_equal Language['Nginx'], Language.find_by_filename('nginx.conf').first
    assert_equal [], Language.find_by_filename('foo.rb')
    assert_equal [], Language.find_by_filename('rb')
    assert_equal [], Language.find_by_filename('.null')
    assert_equal [Language['Shell']], Language.find_by_filename('.bashrc')
    assert_equal [Language['Shell']], Language.find_by_filename('bash_profile')
    assert_equal [Language['Shell']], Language.find_by_filename('.zshrc')
    assert_equal [Language['Clojure']], Language.find_by_filename('riemann.config')
  end

  def test_find_by_interpreter
    {
      "ruby" => "Ruby",
      "Rscript" => "R",
      "sh" => "Shell",
      "bash" => "Shell",
      "python" => "Python",
      "python2" => "Python",
      "python3" => "Python",
      "sbcl" => "Common Lisp",
      "sclang" => "SuperCollider"
    }.each do |interpreter, language|
      assert_equal [Language[language]], Language.find_by_interpreter(interpreter)
    end

    assert_equal [], Language.find_by_interpreter(nil)
  end

  def test_find
    assert_equal 'Ruby', Language['Ruby'].name
    assert_equal 'Ruby', Language['ruby'].name
    assert_equal 'C++', Language['C++'].name
    assert_equal 'C++', Language['c++'].name
    assert_equal 'C++', Language['cpp'].name
    assert_equal 'C#', Language['C#'].name
    assert_equal 'C#', Language['c#'].name
    assert_equal 'C#', Language['csharp'].name
    assert_nil Language['defunkt']
    assert_nil Language[nil]
  end

  def test_find_ignores_case
    assert_equal 'AGS Script', Language['ags script'].name
    assert_equal 'AGS Script', Language['ags sCRIPT'].name
  end

  def test_find_by_name_ignores_case
    assert_equal 'AGS Script', Language.find_by_name('ags script').name
    assert_equal 'AGS Script', Language.find_by_name('ags sCRIPT').name
  end

  def test_find_by_alias_ignores_case
    refute_includes Language['AGS Script'].aliases, 'AGS'
    assert_equal 'AGS Script', Language.find_by_alias('AGS').name
  end

  def test_find_ignores_comma
    assert_equal 'Rust', Language['rust,no_run'].name
  end

  def test_find_by_name_ignores_comma
    assert_equal Language['Rust'], Language.find_by_name('rust,no_run')
  end

  def test_find_by_alias_ignores_comma
    assert_equal Language['Rust'], Language.find_by_alias('rust,no_run')
  end

  def test_doesnt_blow_up_with_blank_lookup
    assert_nil Language.find_by_alias('')
    assert_nil Language.find_by_name(nil)
    assert_nil Language[""]
  end

  def test_does_not_blow_up_with_non_string_lookup
    assert_nil Language.find_by_alias(true)
    assert_nil Language.find_by_name(true)
    assert_nil Language[true]
  end

  def test_name
    assert_equal 'Perl',   Language['Perl'].name
    assert_equal 'Python', Language['Python'].name
    assert_equal 'Ruby',   Language['Ruby'].name
  end

  def test_escaped_name
    assert_equal 'C', Language['C'].escaped_name
    assert_equal 'C%23', Language['C#'].escaped_name
    assert_equal 'C%2B%2B', Language['C++'].escaped_name
    assert_equal 'Objective-C', Language['Objective-C'].escaped_name
    assert_equal 'Common%20Lisp', Language['Common Lisp'].escaped_name
  end

  def test_error_without_name
    assert_raises ArgumentError do
      Language.new :name => nil
    end
  end

  def test_color
    assert_equal '#701516', Language['Ruby'].color
    assert_equal '#3572A5', Language['Python'].color
    assert_equal '#f1e05a', Language['JavaScript'].color
    assert_equal '#3178c6', Language['TypeScript'].color
    assert_equal '#3d9970', Language['LSL'].color
  end

  def test_colors
    assert Language.colors.include?(Language['Ruby'])
    assert Language.colors.include?(Language['Python'])
  end

  def test_ace_mode
    assert_equal 'c_cpp', Language['C++'].ace_mode
    assert_equal 'coffee', Language['CoffeeScript'].ace_mode
    assert_equal 'csharp', Language['C#'].ace_mode
    assert_equal 'css', Language['CSS'].ace_mode
    assert_equal 'lsl', Language['LSL'].ace_mode
    assert_equal 'javascript', Language['JavaScript'].ace_mode
    assert_equal 'text', Language['FORTRAN'].ace_mode
  end

  def test_codemirror_mode
    assert_equal 'ruby', Language['Ruby'].codemirror_mode
    assert_equal 'javascript', Language['JavaScript'].codemirror_mode
    assert_equal 'clike', Language['C'].codemirror_mode
    assert_equal 'clike', Language['C++'].codemirror_mode
  end

  def test_codemirror_mime_type
    assert_equal 'text/x-ruby', Language['Ruby'].codemirror_mime_type
    assert_equal 'text/javascript', Language['JavaScript'].codemirror_mime_type
    assert_equal 'text/x-csrc', Language['C'].codemirror_mime_type
    assert_equal 'text/x-c++src', Language['C++'].codemirror_mime_type
  end

  def test_wrap
    assert_equal false, Language['C'].wrap
    assert_equal true, Language['Markdown'].wrap
  end

  def test_extensions
    assert Language['LSL'].extensions.include?('.lsl')
    assert Language['Perl'].extensions.include?('.pl')
    assert Language['Python'].extensions.include?('.py')
    assert Language['Ruby'].extensions.include?('.rb')
    assert Language['SuperCollider'].extensions.include?('.scd')
  end

  def test_eql
    assert Language['Ruby'].eql?(Language['Ruby'])
    assert !Language['Ruby'].eql?(Language['Python'])
  end

  def test_by_type
    assert !Language.by_type(:prose).nil?
  end

  def test_all_languages_have_grammars
    scopes = YAML.load(File.read(File.expand_path("../../grammars.yml", __FILE__))).values.flatten
    missing = Language.all.reject { |language| language.tm_scope == "none" || scopes.include?(language.tm_scope) }
    message = "The following languages' scopes are not listed in grammars.yml. Please add grammars for all new languages.
"
    message << "If no grammar exists for a language, mark the language with `tm_scope: none` in lib/linguist/languages.yml.
"

    width = missing.map { |language| language.name.length }.max
    message << missing.map { |language| sprintf("%-#{width}s %s", language.name, language.tm_scope) }.sort.join("
")
    assert missing.empty?, message
  end

  def test_all_languages_have_scopes
    languages = YAML.load(File.read(File.expand_path("../../lib/linguist/languages.yml", __FILE__)))
    missing = languages.reject { |name,language| language.has_key?('tm_scope') }
    message = "The following languages do not have a `tm_scope` field defined. Use `tm_scope: none` if the language has no grammar.
"
    message << missing.keys.sort.join("
")
    assert missing.empty?, message
  end

  def test_all_languages_have_type
    missing = Language.all.select { |language| language.type.nil? }
    message = "The following languages do not have a type listed in grammars.yml. Please add types for all new languages.
"

    width = missing.map { |language| language.name.length }.max
    message << missing.map { |language| sprintf("%-#{width}s", language.name) }.sort.join("
")
    assert missing.empty?, message
  end

  def test_all_languages_have_a_language_id_set
    missing = Language.all.select { |language| language.language_id.nil? }

    message = "The following languages do not have a language_id listed in languages.yml. Please run `script/update-ids` as per the contribution guidelines.
"
    missing.each { |language| message << "#{language.name}
" }
    assert missing.empty?, message
  end

  def test_all_languages_have_a_valid_id
    deleted_language_ids = [21]  # Prevent re-use of deleted language IDs
    invalid = Language.all.select { |language| language.language_id < 0 || deleted_language_ids.include?(language.language_id) || (language.language_id > 431 && language.language_id < 1024) || language.language_id >= (2**31 - 1) }

    message = "The following languages do not have a valid language_id. Please run `script/update-ids` as per the contribution guidelines.
"
    invalid.each { |language| message << "#{language.name}
" }
    assert invalid.empty?, message
  end

  def test_all_language_id_are_unique
    duplicates = Language.all.group_by{ |language| language.language_id }.select { |k, v| v.size > 1 }.map(&:first)

    message = "The following language_id are used several times in languages.yml. Please run `script/update-ids` as per the contribution guidelines.
"
    duplicates.each { |language_id| message << "#{language_id}
" }
    assert duplicates.empty?, message
  end

  def test_all_languages_have_a_valid_ace_mode
    ace_fixture_path = File.join('test', 'fixtures', 'ace_modes.json')
    skip("No ace_modes.json file") unless File.exist?(ace_fixture_path)

    ace_modes = Yajl.load(File.read(ace_fixture_path))
    ace_github_modes = ace_modes[0].concat(ace_modes[1])
    existing_ace_modes = ace_github_modes.map do |ace_github_mode|
      File.basename(ace_github_mode["name"], ".js") if ace_github_mode["name"]  !~ /_highlight_rules|_test|_worker/
    end.compact.uniq.sort.map(&:downcase)

    missing = Language.all.reject { |language| language.ace_mode == "text" || existing_ace_modes.include?(language.ace_mode) }
    message = "The following languages do not have an Ace mode listed in languages.yml. Please add an Ace mode for all new languages.
"
    message << "If no Ace mode exists for a language, mark the language with `ace_mode: text` in lib/linguist/languages.yml.
"

    width = missing.map { |language| language.name.length }.max
    message << missing.map { |language| sprintf("%-#{width}s %s", language.name, language.ace_mode) }.sort.join("
")
    assert missing.empty?, message
  end

  def test_codemirror_modes_present
    Language.all.each do |language|
      if language.codemirror_mode || language.codemirror_mime_type
        assert language.codemirror_mode, "#{language.inspect} missing CodeMirror mode"
        assert language.codemirror_mime_type, "#{language.inspect} missing CodeMirror MIME mode"
      end
    end
  end

  def test_valid_codemirror_mode
    Language.all.each do |language|
      if mode = language.codemirror_mode
        assert File.exist?(File.expand_path("../../vendor/CodeMirror/mode/#{mode}", __FILE__)), "#{mode} isn't a valid CodeMirror mode"
      end
    end
  end

  def test_codemirror_mode_and_mime_defined_by_meta_mapping
    meta = File.read(File.expand_path("../../vendor/CodeMirror/mode/meta.js", __FILE__))
    Language.all.each do |language|
      next unless language.codemirror_mode && language.codemirror_mime_type
      assert meta.match(/^.+#{Regexp.escape(language.codemirror_mime_type)}.+#{Regexp.escape(language.codemirror_mode)}.+$/), "#{language.inspect}: #{language.codemirror_mime_type} not defined under #{language.codemirror_mode}"
    end
  end

  def test_codemirror_mime_declared_in_mode_file
    Language.all.each do |language|
      next unless language.codemirror_mode && language.codemirror_mime_type
      filename = File.expand_path("../../vendor/CodeMirror/mode/#{language.codemirror_mode}/#{language.codemirror_mode}.js", __FILE__)
      assert File.exist?(filename), "#{filename} does not exist"
      assert File.read(filename).match(language.codemirror_mime_type), "#{language.inspect}: #{language.codemirror_mime_type} not defined in #{filename}"
    end
  end

  def test_all_popular_languages_exist
    popular = YAML.load(File.read(File.expand_path("../../lib/linguist/popular.yml", __FILE__)))

    missing = popular - Language.all.map(&:name)
    message = "The following languages are listed in lib/linguist/popular.yml but not in lib/linguist/languages.yml.
"
    message << missing.sort.join("
")
    assert missing.empty?, message
  end

  def test_non_crash_on_comma
    assert_nil Language[',']
    assert_nil Language.find_by_name(',')
    assert_nil Language.find_by_alias(',')
  end

  def test_detect_prefers_markdown_for_md
    blob = Linguist::FileBlob.new(File.join(samples_path, "Markdown/symlink.md"))
    match = Linguist.detect(blob)
    assert_equal Language["Markdown"], match
  end

  def test_fs_names
    Language.all.each do |language|
      next unless /[\\:*?"<>|]/.match(language.name)
      assert language.fs_name, "#{language.name} needs an fs_name for Windows' file system."
      assert !/[\\:*?"<>|]/.match(language.fs_name), "The fs_name for #{language.name} is invalid."
    end
  end
end