Testing Benchmark Workflow API Integration in AutoGPT
This test suite exercises AutoGPT's benchmark workflow end to end, focusing on file-operation tasks driven through the Agent Protocol REST endpoints. Each case creates a task on the benchmark server, runs a step, and requests an evaluation, verifying along the way that the agent and the benchmark report consistent state.
Source: significant-gravitas/autogpt, classic/benchmark/tests/test_benchmark_workflow.py
import datetime
import time

import pytest
import requests

URL_BENCHMARK = "http://localhost:8080/ap/v1"
URL_AGENT = "http://localhost:8000/ap/v1"
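# The benchmark service (port 8080) fronts the Agent Protocol routes used below and
# relays task activity to the agent under test (port 8000); both must be running locally.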
try:
    response = requests.get(f"{URL_AGENT}/agent/tasks")
except requests.exceptions.ConnectionError:
    pytest.skip("No agent available to test against", allow_module_level=True)
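# Each case pairs a benchmark eval_id with the prompt sent to the agent, the number of
# artifacts expected on the freshly created task, and the expected evaluation outcome.
# The ReadFile case expects one artifact, presumably because the challenge ships
# file_to_read.txt with the task, while the WriteFile task starts with none.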
@pytest.mark.parametrize(
    "eval_id, input_text, expected_artifact_length, test_name, should_be_successful",
    [
        (
            "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
            "Write the word 'Washington' to a .txt file",
            0,
            "WriteFile",
            True,
        ),
        (
            "f219f3d3-a41b-45a9-a3d0-389832086ee8",
            "Read the file called file_to_read.txt "
            "and write its content to a file called output.txt",
            1,
            "ReadFile",
            False,
        ),
    ],
)
def test_entire_workflow(
    eval_id: str,
    input_text: str,
    expected_artifact_length: int,
    test_name: str,
    should_be_successful: bool,
):
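    # Build the task request and record the agent's current task count as a baseline.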
    task_request = {"eval_id": eval_id, "input": input_text}
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_before = response.json()["pagination"]["total_items"]
    # Create the task via a POST to the benchmark server; the agent's task list should grow by one
    task_response_benchmark = requests.post(
        URL_BENCHMARK + "/agent/tasks", json=task_request
    )
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_after = response.json()["pagination"]["total_items"]
    assert task_count_after == task_count_before + 1
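    # Timestamp taken just after the task exists; compared against benchmark_start_time below.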
    timestamp_after_task_eval_created = datetime.datetime.now(datetime.timezone.utc)
    time.sleep(1.1)  # To make sure the 2 timestamps to compare are different
    assert task_response_benchmark.status_code == 200
    task_response_benchmark = task_response_benchmark.json()
    assert task_response_benchmark["input"] == input_text
    task_response_benchmark_id = task_response_benchmark["task_id"]
    response_task_agent = requests.get(
        f"{URL_AGENT}/agent/tasks/{task_response_benchmark_id}"
    )
    assert response_task_agent.status_code == 200
    response_task_agent = response_task_agent.json()
    assert len(response_task_agent["artifacts"]) == expected_artifact_length
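    # Execute a single step for the task through the benchmark server.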
    step_request = {"input": input_text}
    step_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/steps",
        json=step_request,
    )
    assert step_response.status_code == 200
    step_response = step_response.json()
    assert step_response["is_last"] is True  # Assuming is_last is always True
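    # Ask the benchmark server to evaluate the completed task.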
    eval_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/evaluations",
        json={},
    )
    assert eval_response.status_code == 200
    eval_response = eval_response.json()
    print("eval_response")
    print(eval_response)
    assert eval_response["run_details"]["test_name"] == test_name
    assert eval_response["metrics"]["success"] == should_be_successful
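    # The reported benchmark start time must predate the timestamp captured right after task creation.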
    benchmark_start_time = datetime.datetime.fromisoformat(
        eval_response["run_details"]["benchmark_start_time"]
    )
    assert benchmark_start_time < timestamp_after_task_eval_created
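For quick manual checks outside pytest, the same Agent Protocol flow can be replayed with a short script. The sketch below is not part of the test suite: it assumes the benchmark server is running on the same local URL, reuses the WriteFile eval_id from the parametrized cases, and the replay_workflow helper is a name introduced here purely for illustration.

import requests

URL_BENCHMARK = "http://localhost:8080/ap/v1"


def replay_workflow(eval_id: str, input_text: str) -> dict:
    # Create the task, run one step, then request an evaluation via the benchmark server.
    task = requests.post(
        f"{URL_BENCHMARK}/agent/tasks",
        json={"eval_id": eval_id, "input": input_text},
    ).json()
    task_id = task["task_id"]
    requests.post(f"{URL_BENCHMARK}/agent/tasks/{task_id}/steps", json={"input": input_text})
    evaluation = requests.post(f"{URL_BENCHMARK}/agent/tasks/{task_id}/evaluations", json={})
    return evaluation.json()


if __name__ == "__main__":
    result = replay_workflow(
        "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
        "Write the word 'Washington' to a .txt file",
    )
    print(result["run_details"]["test_name"], result["metrics"]["success"])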