Back to Repositories

Testing Multi-Format File Operations in AutoGPT

This test suite validates file parsing operations across multiple file formats, focusing on text extraction and binary detection capabilities in AutoGPT. It implements comprehensive testing for file handling utilities with various document types including plain text, structured formats, and binary files.

Test Coverage Overview

The test suite provides extensive coverage for file parsing operations across 10 different file formats.

Key functionality tested includes:

Text extraction from ten file formats (TXT, CSV, PDF, DOCX, JSON, XML, YAML, HTML, Markdown, and LaTeX)
Binary file detection
Format-specific parsing validation
Content integrity verification

Edge cases covered include binary vs text file handling and proper cleanup of temporary test files.

Implementation Analysis

The testing approach uses pytest’s parametrize feature to efficiently test multiple file formats with minimal code duplication.

Testing patterns include:

Temporary file creation and cleanup
File content verification
Binary detection validation
Format-specific mock file generation

Framework features utilized: pytest's parametrize marker (pytest.mark.parametrize), which generates one test case per file format.

Technical Details

Testing tools and libraries:

pytest for test framework
tempfile for temporary file management
Multiple format-specific libraries (docx, yaml, BeautifulSoup), plus the standard-library json and xml.etree modules
Custom file operation utilities

Setup includes mock file generation functions for each supported format with proper cleanup mechanisms.

Best Practices Demonstrated

The test suite exemplifies several testing best practices including proper resource cleanup, parametrized testing for code reuse, and comprehensive format coverage.

Notable practices:

Modular test file creation
Consistent verification methodology
Efficient test parametrization
Proper temporary resource management

significant-gravitas/autogpt

classic/forge/forge/utils/test_file_operations.py

            
import json
import logging
import os.path
import tempfile
from pathlib import Path
from xml.etree import ElementTree

import docx
import pytest
import yaml
from bs4 import BeautifulSoup

from .file_operations import decode_textual_file, is_file_binary_fn

# Module-level logger; test_parsers passes it to decode_textual_file.
logger = logging.getLogger(__name__)

# Reference text that the mock files are built around; test_parsers asserts
# that it appears in the text decoded from each format.
plain_text_str = "Hello, world!"


def mock_text_file():
    """Write ``plain_text_str`` to a new temporary ``.txt`` file.

    The file is created with ``delete=False``, so it outlives this call and
    the caller is responsible for removing it.

    Returns:
        The path of the created file.
    """
    handle = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt")
    with handle:
        handle.write(plain_text_str)
    return handle.name


def mock_csv_file():
    """Write ``plain_text_str`` to a new temporary ``.csv`` file.

    The content is the bare reference string, written as-is. The file is
    created with ``delete=False``, so the caller is responsible for
    removing it.

    Returns:
        The path of the created file.
    """
    handle = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv")
    with handle:
        handle.write(plain_text_str)
    return handle.name


def mock_pdf_file():
    """Create a minimal hand-written PDF whose page text is "Hello, world!".

    The PDF is assembled from raw bytes rather than through a PDF library.
    The page text is hard-coded in the content stream, so it must be kept in
    sync with ``plain_text_str`` by hand. The file is created with
    ``delete=False``; the caller is responsible for removing it.

    Returns:
        The path of the created ``.pdf`` file.
    """
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".pdf") as f:
        # Write the PDF header
        f.write(b"%PDF-1.7\n")
        # Write the document catalog
        f.write(b"1 0 obj\n")
        f.write(b"<< /Type /Catalog /Pages 2 0 R >>\n")
        f.write(b"endobj\n")
        # Write the page object
        f.write(b"2 0 obj\n")
        f.write(
            b"<< /Type /Page /Parent 1 0 R /Resources << /Font << /F1 3 0 R >> >> "
            b"/MediaBox [0 0 612 792] /Contents 4 0 R >>\n"
        )
        f.write(b"endobj\n")
        # Write the font object
        f.write(b"3 0 obj\n")
        f.write(
            b"<< /Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica-Bold >>\n"
        )
        f.write(b"endobj\n")
        # Write the page contents object
        f.write(b"4 0 obj\n")
        # NOTE(review): /Length and the xref offsets below are fixed literals,
        # not computed from the bytes written; presumably the parser under
        # test tolerates this - confirm before changing any of the content.
        f.write(b"<< /Length 25 >>\n")
        f.write(b"stream\n")
        f.write(b"BT\n/F1 12 Tf\n72 720 Td\n(Hello, world!) Tj\nET\n")
        f.write(b"endstream\n")
        f.write(b"endobj\n")
        # Write the cross-reference table
        f.write(b"xref\n")
        f.write(b"0 5\n")
        f.write(b"0000000000 65535 f \n")
        f.write(b"0000000017 00000 n \n")
        f.write(b"0000000073 00000 n \n")
        f.write(b"0000000123 00000 n \n")
        f.write(b"0000000271 00000 n \n")
        f.write(b"trailer\n")
        f.write(b"<< /Size 5 /Root 1 0 R >>\n")
        f.write(b"startxref\n")
        f.write(b"380\n")
        f.write(b"%%EOF\n")
        # Trailing NUL byte; presumably what makes is_file_binary_fn classify
        # this file as binary - confirm against its implementation.
        f.write(b"\x00")
    return f.name


def mock_docx_file():
    """Create a temporary ``.docx`` file with ``plain_text_str`` as a paragraph.

    The temporary file only reserves a unique path; the document is saved by
    passing that path to ``Document.save`` rather than the open handle. The
    file is created with ``delete=False``, so the caller is responsible for
    removing it.

    Returns:
        The path of the created file.
    """
    tmp = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".docx")
    with tmp:
        doc = docx.Document()
        doc.add_paragraph(plain_text_str)
        doc.save(tmp.name)
    return tmp.name


def mock_json_file():
    """Create a temporary ``.json`` file holding ``{"text": plain_text_str}``.

    The file is created with ``delete=False``, so the caller is responsible
    for removing it.

    Returns:
        The path of the created file.
    """
    payload = {"text": plain_text_str}
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as out:
        # Serialize with the default encoder settings, then write in one go.
        out.write(json.dumps(payload))
    return out.name


def mock_xml_file():
    """Create a temporary ``.xml`` file with a single ``<text>`` element.

    The element's text is ``plain_text_str``. The file is created with
    ``delete=False``, so the caller is responsible for removing it.

    Returns:
        The path of the created file.
    """
    element = ElementTree.Element("text")
    element.text = plain_text_str
    # Default serialization: bytes, no XML declaration.
    serialized = ElementTree.tostring(element)
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".xml") as out:
        out.write(serialized)
    return out.name


def mock_yaml_file():
    """Create a temporary ``.yaml`` file holding ``{"text": plain_text_str}``.

    The file is created with ``delete=False``, so the caller is responsible
    for removing it.

    Returns:
        The path of the created file.
    """
    payload = {"text": plain_text_str}
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".yaml") as out:
        yaml.dump(payload, stream=out)
    return out.name


def mock_html_file():
    """Create a temporary ``.html`` file with ``plain_text_str`` in a ``<p>``.

    The markup is parsed with BeautifulSoup's ``html.parser`` and the parsed
    document's string form is what gets written. The file is created with
    ``delete=False``, so the caller is responsible for removing it.

    Returns:
        The path of the created file.
    """
    markup = (
        "<html>"
        "<head><title>This is a test</title></head>"
        f"<body><p>{plain_text_str}</p></body>"
        "</html>"
    )
    soup = BeautifulSoup(markup, "html.parser")
    tmp = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".html")
    with tmp:
        tmp.write(str(soup))
    return tmp.name


def mock_md_file():
    """Create a temporary ``.md`` file with ``plain_text_str`` as a heading.

    The file holds a single level-1 heading line: the reference text followed
    by an extra ``!`` and a newline. The file is created with
    ``delete=False``, so the caller is responsible for removing it.

    Returns:
        The path of the created file.
    """
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as f:
        f.write(f"# {plain_text_str}!\n")
    return f.name


def mock_latex_file():
    """Create a temporary ``.tex`` file wrapping ``plain_text_str`` in a document.

    The LaTeX source is written on one line with no separators between the
    preamble, the body text, and the closing ``\\end{document}``. The file is
    created with ``delete=False``, so the caller is responsible for removing
    it.

    Returns:
        The path of the created file.
    """
    pieces = (
        r"\documentclass{article}",
        r"\begin{document}",
        plain_text_str,
        r"\end{document}",
    )
    source = "".join(pieces)
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tex") as out:
        out.write(source)
    return out.name


# Maps each file extension under test to the function that creates a mock
# file of that type. test_parsers is parametrized over these
# (extension, creator) pairs, one test case per entry.
respective_file_creation_functions = {
    ".txt": mock_text_file,
    ".csv": mock_csv_file,
    ".pdf": mock_pdf_file,
    ".docx": mock_docx_file,
    ".json": mock_json_file,
    ".xml": mock_xml_file,
    ".yaml": mock_yaml_file,
    ".html": mock_html_file,
    ".md": mock_md_file,
    ".tex": mock_latex_file,
}
# Extensions whose mock files test_parsers expects is_file_binary_fn to report
# as binary; every other extension above is expected to be reported as text.
binary_files_extensions = [".pdf", ".docx"]


@pytest.mark.parametrize(
    "file_extension, c_file_creator",
    respective_file_creation_functions.items(),
)
def test_parsers(file_extension, c_file_creator):
    """Check text decoding and binary detection for one mock file format.

    Creates the mock file, decodes it with ``decode_textual_file`` using the
    extension taken from the file's own name, and asserts that
    ``plain_text_str`` appears in the decoded text. It then asserts that
    ``is_file_binary_fn`` reports the file as binary exactly when its
    extension is listed in ``binary_files_extensions``.

    Args:
        file_extension: Extension key (e.g. ``".pdf"``) of the format tested.
        c_file_creator: Zero-argument callable that creates the mock file and
            returns its path.
    """
    created_file_path = Path(c_file_creator())
    try:
        with open(created_file_path, "rb") as file:
            loaded_text = decode_textual_file(
                file, os.path.splitext(file.name)[1], logger
            )

            assert plain_text_str in loaded_text

            should_be_binary = file_extension in binary_files_extensions
            assert should_be_binary == is_file_binary_fn(file)
    finally:
        # The mock files are created with delete=False, so remove the file
        # even when decoding raises or an assertion fails; otherwise every
        # failing case would leave a stray temp file behind.
        created_file_path.unlink()