Testing CPU-Based Adam Optimizer Implementation in DeepSpeed
This test suite validates DeepSpeed's CPU-based Adam optimizer, DeepSpeedCPUAdam, by comparing its parameter updates against PyTorch's native torch.optim.AdamW and DeepSpeed's CUDA-based FusedAdam. The tests check that the implementations produce matching updates within tolerance across data types (fp16, bf16, fp32) and across parameter sizes ranging from a few dozen elements to roughly one million.
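As context for the tests, here is a minimal sketch of the pattern the suite exercises: parameters and gradients stay on the CPU, and DeepSpeedCPUAdam applies the Adam update to them. The tensor size and learning rate below are illustrative, and the sketch assumes DeepSpeed is installed with a working CPUAdam op build.

import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

# Parameters and gradients live on the CPU; the optimizer updates them in place.
param = torch.nn.Parameter(torch.randn(1024, device='cpu'))
optimizer = DeepSpeedCPUAdam([param], lr=1e-3)  # illustrative learning rate

param.grad = torch.randn(1024, device='cpu')
optimizer.step()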
Source: microsoft/deepspeed, tests/unit/ops/adam/test_cpu_adam.py
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
import numpy as np
import pytest
from cpuinfo import get_cpu_info
import deepspeed
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.adam import FusedAdam
from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder
from unit.common import DistributedTest
if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
    pytest.skip("cpu-adam is not compatible", allow_module_level=True)

pytest.cpu_vendor = get_cpu_info()["vendor_id_raw"].lower()

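# Helper: assert that two tensors agree element-wise within an absolute tolerance.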
def check_equal(first, second, atol=1e-2, verbose=False):
    x = first.detach().float().numpy()
    y = second.detach().float().numpy()
    print("ATOL", atol)
    if verbose:
        print("x = {}".format(x.flatten()))
        print("y = {}".format(y.flatten()))
        print('-' * 80)
    np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol)

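# Drive both optimizers for ten steps with identical random gradients, then
# require the resulting parameter norms to agree within 1% of the reference norm.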
def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2):
    for i in range(10):
        param1.grad = torch.randn(model_size, device=param1.device).to(param1.dtype)
        param2.grad = param1.grad.clone().detach().to(device=param2.device, dtype=param2.dtype)

        optimizer1.step()
        optimizer2.step()

    tolerance = param1.float().norm().detach().numpy() * 1e-2
    check_equal(param1.float().norm(), param2.float().cpu().norm(), atol=tolerance, verbose=True)

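# Parametrized over dtype (fp16, bf16, fp32) and flat parameter sizes from 22 elements up to 1048576.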
@pytest.mark.parametrize('dtype', [torch.half, torch.bfloat16, torch.float], ids=["fp16", "bf16", "fp32"])
@pytest.mark.parametrize('model_size',
                         [
                             (64),
                             (22),
                             #(55),
                             (128),
                             (1024),
                             (1048576),
                         ]) # yapf: disable
class TestCPUAdam(DistributedTest):
    world_size = 1
    reuse_dist_env = True
    requires_cuda_env = False
    if not get_accelerator().is_available():
        init_distributed = False
        set_dist_env = False

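    # Compare DeepSpeedCPUAdam (parameters on CPU) against the accelerator FusedAdam kernel;
    # skipped when no accelerator or compatible FusedAdam build is available.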
    @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.")
    @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME],
                        reason="FusedAdam is not compatible")
    def test_fused_adam_equal(self, dtype, model_size):
        if dtype not in get_accelerator().supported_dtypes():
            pytest.skip(f"dtype {dtype} not supported in current accelerator")
        if ("amd" in pytest.cpu_vendor) and (dtype == torch.half):
            pytest.skip("cpu-adam with half precision not supported on AMD CPUs")

        from deepspeed.ops.adam import DeepSpeedCPUAdam

        cpu_data = torch.randn(model_size, device='cpu').to(dtype)
        cpu_param = torch.nn.Parameter(cpu_data)
        cuda_param = torch.nn.Parameter(cpu_data.to(get_accelerator().device_name()))

        # tolerance = cpu_param.float().norm().detach().numpy() * 1e-2
        # check_equal(cpu_param.float().norm(),
        #             cuda_param.float().cpu().norm(),
        #             atol=tolerance,
        #             verbose=True)

        cpu_optimizer = DeepSpeedCPUAdam([cpu_param])
        cuda_optimizer = FusedAdam([cuda_param])

        _compare_optimizers(model_size=model_size,
                            param1=cpu_param,
                            optimizer1=cpu_optimizer,
                            param2=cuda_param,
                            optimizer2=cuda_optimizer)

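    # Compare DeepSpeedCPUAdam against torch.optim.AdamW as the reference; the reference
    # parameter lives on the accelerator when one is available, otherwise on the CPU.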
    def test_torch_adamw_equal(self, dtype, model_size):
        if get_accelerator().is_available():
            if dtype == torch.half:
                pytest.skip("torch.optim.AdamW with half precision inf/nan output.")
            if ("amd" in pytest.cpu_vendor) and (dtype == torch.half):
                pytest.skip("cpu-adam with half precision not supported on AMD CPUs")
            ref_param_device = get_accelerator().device_name()
        else:
            if dtype == torch.half:
                pytest.skip("torch.optim.AdamW with half precision only supported in CUDA environments.")
            ref_param_device = 'cpu'

        from deepspeed.ops.adam import DeepSpeedCPUAdam

        cpu_data = torch.randn(model_size, device='cpu').to(dtype)
        cpu_param = torch.nn.Parameter(cpu_data)
        ref_param = torch.nn.Parameter(cpu_data.to(ref_param_device))

        cpu_optimizer = DeepSpeedCPUAdam([cpu_param])
        ref_optimizer = torch.optim.AdamW([ref_param])

        _compare_optimizers(model_size=model_size,
                            param1=cpu_param,
                            optimizer1=cpu_optimizer,
                            param2=ref_param,
                            optimizer2=ref_optimizer)

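# DeepSpeedCPUAdam only updates CPU-resident parameters; stepping on a parameter that
# lives on an accelerator device must raise an AssertionError.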
class TestCPUAdamGPUError(DistributedTest):

    def test_cpu_adam_gpu_error(self):
        model_size = 64
        from deepspeed.ops.adam import DeepSpeedCPUAdam
        device = get_accelerator().device_name(0)  # 'cuda:0' or 'xpu:0'
        param = torch.nn.Parameter(torch.randn(model_size, device=device))
        optimizer = DeepSpeedCPUAdam([param])

        param.grad = torch.randn(model_size, device=device)
        with pytest.raises(AssertionError):
            optimizer.step()
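The file is a standard pytest module. On machines where the CPUAdam op cannot be built, the module-level pytest.skip guard skips the entire file rather than reporting failures, and the accelerator-dependent test skips itself individually when no compatible device or dtype is present.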