Back to Repositories

Testing SummaryIndex Document Management in llama_index

This test suite validates the functionality of the SummaryIndex class in the llama_index library, focusing on document indexing, retrieval, and management operations.

Test Coverage Overview

The test suite provides comprehensive coverage of SummaryIndex operations including:

  • Document building and indexing functionality
  • Document refresh and update capabilities
  • Document insertion and deletion operations
  • Multiple document handling
  • Retriever mode implementations

Implementation Analysis

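The tests rely on pytest fixtures (`documents` and `patch_token_text_splitter`) to supply input documents and a patched token text splitter, and then assert directly on the resulting SummaryIndex state.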

Key patterns include:
  • Document content verification through node inspection
  • Document ID management and reference tracking
  • Index structure integrity checks
  • Retriever mode validation

Technical Details

Testing infrastructure includes:

  • pytest framework for test organization
  • Custom fixtures for document creation
  • Token text splitter mocking
  • BaseRetriever implementation verification
  • ListRetrieverMode enumeration testing

Best Practices Demonstrated

The test suite exemplifies quality testing practices through:

  • Isolated test cases for specific functionality
  • Comprehensive edge case coverage
  • Clear test method naming and organization
  • Fixture-based setup, with each test constructing its own index
  • Effective use of assertions for validation

run-llama/llama_index

llama-index-core/tests/indices/list/test_index.py

            
"""Test summary index."""

from typing import List

from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.indices.list.base import ListRetrieverMode, SummaryIndex
from llama_index.core.schema import Document


def test_build_list(documents: List[Document], patch_token_text_splitter) -> None:
    """Build a summary index from ``documents`` and verify its four nodes."""
    index = SummaryIndex.from_documents(documents)
    stored_ids = index.index_struct.nodes
    assert len(stored_ids) == 4

    # Resolve the stored node ids through the docstore and compare the text
    # of each node, in order, against the expected chunks.
    expected_texts = (
        "Hello world.",
        "This is a test.",
        "This is another test.",
        "This is a test v2.",
    )
    built_nodes = index.docstore.get_nodes(stored_ids)
    for position, expected in enumerate(expected_texts):
        assert built_nodes[position].get_content() == expected


def test_refresh_list(documents: List[Document]) -> None:
    """Test refreshing a list index with unchanged and then changed documents.

    ``refresh_ref_docs`` is expected to report ``False`` for documents whose
    content is unchanged and ``True`` for a document whose text was modified,
    and the index must afterwards hold the modified text.
    """
    # add extra document
    more_documents = [*documents, Document(text="Test document 2")]

    # ensure documents have doc_id (stable ids let refresh match old and new)
    for i, doc in enumerate(more_documents):
        doc.doc_id = str(i)  # type: ignore[misc]

    # create index
    summary_index = SummaryIndex.from_documents(more_documents)

    # check that no documents are refreshed
    refreshed_docs = summary_index.refresh_ref_docs(more_documents)
    assert refreshed_docs[0] is False
    assert refreshed_docs[1] is False

    # modify a document and test again; the ids are reassigned so the new
    # Document carries the same doc_id as the one it replaces
    more_documents = [*documents, Document(text="Test document 2, now with changes!")]
    for i, doc in enumerate(more_documents):
        doc.doc_id = str(i)  # type: ignore[misc]

    # second document should refresh
    refreshed_docs = summary_index.refresh_ref_docs(more_documents)
    assert refreshed_docs[0] is False
    assert refreshed_docs[1] is True

    # the last node in the index should now hold the updated text
    test_node = summary_index.docstore.get_node(summary_index.index_struct.nodes[-1])
    assert test_node.get_content() == "Test document 2, now with changes!"


def test_build_list_multiple(patch_token_text_splitter) -> None:
    """Test building a list index from multiple multi-line documents.

    Two documents with two lines each are expected to yield four nodes,
    one per line, in document order.
    """
    # The newlines must be written as "\n" escapes: a single-quoted string
    # literal cannot span physical lines.
    documents = [
        Document(text="Hello world.\nThis is a test."),
        Document(text="This is another test.\nThis is a test v2."),
    ]
    summary_index = SummaryIndex.from_documents(documents)
    assert len(summary_index.index_struct.nodes) == 4
    nodes = summary_index.docstore.get_nodes(summary_index.index_struct.nodes)
    # check contents of nodes
    assert nodes[0].get_content() == "Hello world."
    assert nodes[1].get_content() == "This is a test."
    assert nodes[2].get_content() == "This is another test."
    assert nodes[3].get_content() == "This is a test v2."


def test_list_insert(documents: List[Document], patch_token_text_splitter) -> None:
    """Insert a document into an empty index, without and with an explicit ID."""
    index = SummaryIndex([])
    assert len(index.index_struct.nodes) == 0

    index.insert(documents[0])
    inserted_nodes = index.docstore.get_nodes(index.index_struct.nodes)
    # The inserted document should appear as these chunks, in this order.
    expected_texts = (
        "Hello world.",
        "This is a test.",
        "This is another test.",
        "This is a test v2.",
    )
    for position, expected in enumerate(expected_texts):
        assert inserted_nodes[position].get_content() == expected

    # Insert again with an explicit document ID: every resulting node must
    # reference that ID as its source document.
    doc_with_id = documents[0]
    doc_with_id.doc_id = "test_id"  # type: ignore[misc]
    fresh_index = SummaryIndex([])
    fresh_index.insert(doc_with_id)
    nodes_with_ref = fresh_index.docstore.get_nodes(fresh_index.index_struct.nodes)
    for inserted_node in nodes_with_ref:
        assert inserted_node.ref_doc_id == "test_id"


def test_list_delete(documents: List[Document], patch_token_text_splitter) -> None:
    """Test building a list index and then deleting reference documents.

    Covers deleting the first (two-node) document and, on a freshly built
    index, deleting a middle (single-node) document, checking the remaining
    nodes and their ``ref_doc_id`` values each time.
    """
    # The newline must be written as a "\n" escape: a single-quoted string
    # literal cannot span physical lines.
    new_documents = [
        Document(text="Hello world.\nThis is a test.", id_="test_id_1"),
        Document(text="This is another test.", id_="test_id_2"),
        Document(text="This is a test v2.", id_="test_id_3"),
    ]

    summary_index = SummaryIndex.from_documents(new_documents)

    # test ref doc info for three docs (keys are expected in insertion order)
    all_ref_doc_info = summary_index.ref_doc_info
    for idx, ref_doc_id in enumerate(all_ref_doc_info.keys()):
        assert new_documents[idx].doc_id == ref_doc_id

    # delete from documents
    summary_index.delete_ref_doc("test_id_1")
    assert len(summary_index.index_struct.nodes) == 2
    nodes = summary_index.docstore.get_nodes(summary_index.index_struct.nodes)
    assert nodes[0].ref_doc_id == "test_id_2"
    assert nodes[0].get_content() == "This is another test."
    assert nodes[1].ref_doc_id == "test_id_3"
    assert nodes[1].get_content() == "This is a test v2."
    # check that not in docstore anymore
    source_doc = summary_index.docstore.get_document("test_id_1", raise_error=False)
    assert source_doc is None

    # rebuild the index and delete the middle document instead
    summary_index = SummaryIndex.from_documents(new_documents)
    summary_index.delete_ref_doc("test_id_2")
    assert len(summary_index.index_struct.nodes) == 3
    nodes = summary_index.docstore.get_nodes(summary_index.index_struct.nodes)
    assert nodes[0].ref_doc_id == "test_id_1"
    assert nodes[0].get_content() == "Hello world."
    assert nodes[1].ref_doc_id == "test_id_1"
    assert nodes[1].get_content() == "This is a test."
    assert nodes[2].ref_doc_id == "test_id_3"
    assert nodes[2].get_content() == "This is a test v2."


def test_as_retriever(documents: List[Document]) -> None:
    """Check that each tested retriever mode yields a ``BaseRetriever``."""
    index = SummaryIndex.from_documents(documents)

    # Request the default retriever first, then the embedding one.
    for mode in (ListRetrieverMode.DEFAULT, ListRetrieverMode.EMBEDDING):
        retriever = index.as_retriever(retriever_mode=mode)
        assert isinstance(retriever, BaseRetriever)