Validating Language Sample Database Implementation in GitHub Linguist
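This test suite validates the integrity of the Linguist samples database and its agreement with languages.yml. It checks that the cached database is up to date and populated, that every sample can be identified by a file extension, a shebang interpreter, or a listed filename, that sample extensions and interpreters are declared in languages.yml, and that languages sharing an extension or filename have samples on disk.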
Test Coverage Overview
Implementation Analysis
Technical Details
Best Practices Demonstrated
github-linguist/linguist
test/test_samples.rb
require_relative "./helper"
require "tempfile"
class TestSamples < Minitest::Test
include Linguist
# Compares the serialized samples cache with freshly computed sample data.
# A mismatch only emits a warning; it never fails the test.
def test_up_to_date
  assert serialized = Samples.cache
  assert latest = Samples.data

  # Just warn, it shouldn't scare people off by breaking the build.
  return if serialized['sha256'] == latest['sha256']

  warn "Samples database is out of date. Run `bundle exec rake samples`."

  # Dump both databases as pretty-printed JSON into temp files. Every file
  # that was created is closed and deleted in `ensure`, so it is still
  # removed if dumping or writing raises.
  dumps = []
  begin
    { 'expected.json' => serialized, 'actual.json' => latest }.each do |basename, database|
      file = Tempfile.new(basename)
      dumps << file
      file.write Yajl.dump(database, :pretty => true)
      file.close
    end
  ensure
    dumps.each(&:close!)
  end
end
# Sanity-checks the serialized samples database: each section listed below
# must be non-empty, and the "icf" and "vocabulary" sections must have the
# same size. Failure messages name the offending section.
def test_verify
  assert data = Samples.cache

  %w[vocabulary icf centroids].each do |section|
    refute_empty data[section], "Samples database section #{section.inspect} is empty"
  end

  assert_equal data["icf"].size, data["vocabulary"].size

  %w[extnames interpreters filenames].each do |section|
    refute_empty data[section], "Samples database section #{section.inspect} is empty"
  end
end
# Every sample must be identifiable somehow: a sample with neither a file
# extension nor a :filename entry has to carry an interpreter.
def test_ext_or_shebang
  Samples.each do |entry|
    has_extension = !entry[:extname].to_s.empty?
    next if has_extension || entry[:filename]

    assert entry[:interpreter], "#{entry[:path]} should have a file extension or a shebang, maybe it belongs in filenames/ subdir"
  end
end
# Any sample that carries a :filename must have that filename listed among
# its language's filenames (as declared in languages.yml).
def test_filename_listed
  Samples.each do |entry|
    basename = entry[:filename]
    next unless basename

    language_name = entry[:language]
    accepted = Language[language_name].filenames
    # "HOSTS" is dropped from the accepted list for these two languages.
    accepted = accepted - ["HOSTS"] if ["Hosts File", "INI"].include?(language_name)

    assert_includes accepted, basename, "#{entry[:path]} isn't listed as a filename for #{language_name} in languages.yml"
  end
end
# Check that there aren't samples with extensions or interpreters that
# aren't explicitly defined in languages.yml
#
# One test method is generated for each top-level entry of languages.yml.
languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__)
YAML.load_file(languages_yml).each do |name, options|
  define_method "test_samples_have_parity_with_languages_yml_for_#{name}" do
    # An entry without an 'extensions' key is treated as declaring none.
    options['extensions'] ||= []
    sampled_extnames = Samples.cache['extnames'][name]
    if sampled_extnames
      sampled_extnames.each do |extname|
        wanted = extname.downcase
        # A declared extension counts when it ends with the sample's extension,
        # compared case-insensitively.
        declared = options['extensions'].any? { |candidate| candidate.downcase.end_with?(wanted) }
        assert declared, "#{name} has a sample with extension (#{wanted}) that isn't explicitly defined in languages.yml"
      end
    end

    # Likewise, a missing 'interpreters' key means no interpreters are declared.
    options['interpreters'] ||= []
    sampled_interpreters = Samples.cache['interpreters'][name]
    if sampled_interpreters
      sampled_interpreters.each do |interpreter|
        declared = options['interpreters'].include?(interpreter)
        assert declared,
               "#{name} has a sample with an interpreter (#{interpreter}) that isn't explicitly defined in languages.yml"
      end
    end
  end
end
# If a language extension isn't globally unique then make sure there are samples
#
# One test method is generated per language. The same rule is applied to the
# language's filenames.
Linguist::Language.all.each do |language|
  define_method "test_#{language.name}_has_samples" do
    language.extensions.each do |extension|
      candidates = Language.find_by_extension(extension)
      # Samples are only required when more than one language matches the extension.
      next unless candidates.length > 1

      candidates.each do |candidate|
        # Generic extensions are looked up under test/fixtures/Generic, all
        # others under samples/<language> using a case-insensitive glob.
        samples =
          if Strategy::Extension.generic?(extension)
            "test/fixtures/Generic/#{extension.sub(/^\./, "")}/#{candidate.name}/*"
          else
            "samples/#{candidate.name}/*#{case_insensitive_glob(extension)}"
          end
        assert Dir.glob(samples).any?, "Missing samples in #{samples.inspect}. See https://github.com/github/linguist/blob/master/CONTRIBUTING.md"
      end
    end

    language.filenames.each do |filename|
      # Kludge for an unusual edge-case; see https://bit.ly/41EyUkU
      next if filename == "HOSTS" && ["Hosts File", "INI"].include?(language.name)
      # A sample is only required when more than one language matches the filename.
      next unless Language.find_by_filename(filename).size > 1

      sample = "samples/#{language.name}/filenames/#{filename}"
      assert File.exist?(sample),
             "Missing sample in #{sample.inspect}. See https://github.com/github/linguist/blob/master/CONTRIBUTING.md"
    end
  end
end
# Builds a glob fragment that matches +extension+ regardless of letter case.
# Each character whose lower- and upper-case forms differ becomes a bracket
# class of both forms (".rb" => ".[rR][bB]"); every other character is
# copied through unchanged.
#
# extension - String such as ".rb".
#
# Returns a new String.
def case_insensitive_glob(extension)
  pieces = extension.each_char.map do |char|
    lower = char.downcase
    upper = char.upcase
    lower == upper ? char : "[#{lower}#{upper}]"
  end
  pieces.join
end
end