Back to Repositories

Validating BERT-SQuAD Performance Optimization in DeepSpeed

This test suite validates the end-to-end functionality of DeepSpeed’s BERT implementation for question answering using the SQuAD dataset. It verifies both base DeepSpeed and ZeRO optimization configurations through comprehensive accuracy testing and performance validation.

Test Coverage Overview

The test suite provides extensive coverage of DeepSpeed’s BERT implementation for SQuAD:
  • Base DeepSpeed configuration testing with expected exact match of 83.98% and F1 score of 90.71%
  • ZeRO optimization testing with expected exact match of 84.14% and F1 score of 90.89%
  • Multi-GPU execution validation using 4 GPUs
  • Model checkpoint loading and prediction verification

Implementation Analysis

The testing approach utilizes pytest for structured test execution and validation. It implements two primary test cases that verify different DeepSpeed configurations:
  • Configuration management through dynamic JSON generation
  • Subprocess handling for bash script execution
  • Timeout and error handling mechanisms
  • Numerical validation with tolerance checks

Technical Details

Key technical components include:
  • pytest framework for test organization
  • subprocess module for script execution
  • JSON configuration management
  • Floating-point comparison via math.isclose with absolute tolerances
  • File system operations for temporary directories
  • Custom evaluation module integration

Best Practices Demonstrated

The test implementation showcases several testing best practices:
  • Isolated test environments using temporary directories
  • Proper resource cleanup and process management
  • Configurable timeout handling
  • Precise numerical comparisons with appropriate tolerances
  • Clear separation of configuration and test logic

microsoft/deepspeed

tests/model/BingBertSquad/test_e2e_squad.py

            
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import subprocess as sp
import os
from math import isclose
import sys
import pytest
import json

sys.path.append("../../../DeepSpeedExamples/training/BingBertSquad")
import evaluate as eval

# Location of the SQuAD data set and the pretrained checkpoint on the test host.
squad_dir = "/data/BingBertSquad"
# Root of the BingBertSquad example scripts, relative to this test file's directory.
base_dir = "../../../DeepSpeedExamples/training/BingBertSquad"

# Launcher script invoked via bash; runs fine-tuning + prediction end to end.
script_file_name = "run_squad_deepspeed.sh"
# Pretrained BERT training-state checkpoint used as the starting point.
model_file_name = "training_state_checkpoint_162.tar"
# SQuAD v1.1 dev set used for evaluation.
eval_file_name = "dev-v1.1.json"
# Prediction file the launcher script writes into the output directory.
pred_file_name = "predictions.json"

# Passed to the launcher as a string argument, not an int.
num_gpus = "4"
timeout_sec = 5 * 60 * 60  # 5 hours

# SQuAD evaluation-script version expected by evaluate().
eval_version = "1.1"


def create_config_file(tmpdir, zeroenabled=False):
    """Write a DeepSpeed JSON config into *tmpdir* and return its path.

    Args:
        tmpdir: directory in which ``temp_config.json`` is created.
        zeroenabled: value stored under the ``"zero_optimization"`` key
            (boolean form of the DeepSpeed ZeRO switch).

    Returns:
        Path of the generated config file.
    """
    ds_config = {
        "train_batch_size": 24,
        "train_micro_batch_size_per_gpu": 6,
        "steps_per_print": 10,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 3e-5,
                "weight_decay": 0.0,
                "bias_correction": False,
            },
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True,
        },
        "zero_optimization": zeroenabled,
    }

    path = os.path.join(tmpdir, 'temp_config.json')
    with open(path, 'w') as handle:
        json.dump(ds_config, handle)
    return path


def test_e2e_squad_deepspeed_base(tmpdir):
    """End-to-end SQuAD fine-tuning with the base (non-ZeRO) DeepSpeed config.

    Launches run_squad_deepspeed.sh on 4 GPUs against the pretrained
    checkpoint, then evaluates the produced predictions against the
    known-good exact-match / F1 reference scores.
    """
    config_file = create_config_file(tmpdir)

    # Reference run results => {"exact_match": 83.9829706717124, "f1": 90.71138132004097}
    expected_exact_match = 83.98
    expected_f1 = 90.71

    model_file = os.path.join(squad_dir, model_file_name)
    eval_file = os.path.join(squad_dir, eval_file_name)

    output_dir = os.path.join(tmpdir, "output")
    pred_file = os.path.join(output_dir, pred_file_name)

    proc = sp.Popen(["bash", script_file_name, num_gpus, model_file, squad_dir, output_dir, config_file], cwd=base_dir)

    try:
        proc.communicate(timeout=timeout_sec)
    except sp.TimeoutExpired:
        # Per the subprocess docs: kill, then communicate() again to reap
        # the child and release its resources.
        proc.kill()
        proc.communicate()
        pytest.fail("Error: Timeout")

    # Popen.communicate() never raises CalledProcessError (that belongs to
    # sp.run/check_call), so check the return code explicitly instead of
    # relying on an unreachable except branch.
    if proc.returncode != 0 or not os.path.exists(pred_file):
        pytest.fail("Error: Run Failed")

    eval_result = eval.evaluate(eval_version, eval_file, pred_file)

    print("evaluation result: ", json.dumps(eval_result))

    assert isclose(eval_result["exact_match"], expected_exact_match, abs_tol=1e-2)
    assert isclose(eval_result["f1"], expected_f1, abs_tol=1e-2)


def test_e2e_squad_deepspeed_zero(tmpdir):
    """End-to-end SQuAD fine-tuning with ZeRO optimization enabled.

    Same pipeline as the base test, but the generated DeepSpeed config sets
    "zero_optimization" to True; the reference scores differ accordingly.
    """
    config_file = create_config_file(tmpdir, True)

    # Reference run results => {"exact_match": 84.1438032166509, "f1": 90.89776136505441}
    expected_exact_match = 84.14
    expected_f1 = 90.89

    model_file = os.path.join(squad_dir, model_file_name)
    eval_file = os.path.join(squad_dir, eval_file_name)

    output_dir = os.path.join(tmpdir, "output")
    pred_file = os.path.join(output_dir, pred_file_name)

    proc = sp.Popen(["bash", script_file_name, num_gpus, model_file, squad_dir, output_dir, config_file], cwd=base_dir)

    try:
        proc.communicate(timeout=timeout_sec)
    except sp.TimeoutExpired:
        # Per the subprocess docs: kill, then communicate() again to reap
        # the child and release its resources.
        proc.kill()
        proc.communicate()
        pytest.fail("Error: Timeout")

    # Popen.communicate() never raises CalledProcessError (that belongs to
    # sp.run/check_call), so check the return code explicitly instead of
    # relying on an unreachable except branch.
    if proc.returncode != 0 or not os.path.exists(pred_file):
        pytest.fail("Error: Run Failed")

    eval_result = eval.evaluate(eval_version, eval_file, pred_file)

    print("evaluation result: ", json.dumps(eval_result))

    assert isclose(eval_result["exact_match"], expected_exact_match, abs_tol=1e-2)
    assert isclose(eval_result["f1"], expected_f1, abs_tol=1e-2)