Testing Chunk Localization and Text Segmentation in OpenHands
This test suite validates the chunk localization functionality in OpenHands, focusing on text chunk creation, manipulation, and matching algorithms. The tests ensure reliable text segmentation and comparison capabilities using normalized Longest Common Subsequence (LCS) calculations.
Test Coverage Overview
Implementation Analysis
Technical Details
Best Practices Demonstrated
all-hands-ai/openhands
tests/unit/test_chunk_localizer.py
import pytest
from openhands.utils.chunk_localizer import (
Chunk,
create_chunks,
get_top_k_chunk_matches,
normalized_lcs,
)
def test_chunk_creation():
    """A Chunk stores its text and 1-based line range; the LCS score starts unset."""
    chunk = Chunk(text='test chunk', line_range=(1, 1))
    assert chunk.text == 'test chunk'
    assert chunk.line_range == (1, 1)
    # normalized_lcs is only populated by the matching routines, never at creation.
    assert chunk.normalized_lcs is None
def test_chunk_visualization():
    """visualize() prefixes each line with its 1-based line number and a pipe.

    NOTE(review): the unused ``capsys`` fixture was removed — nothing in this
    test inspects captured stdout/stderr.
    """
    chunk = Chunk(text='line1\nline2', line_range=(1, 2))
    # The rendering ends with a trailing newline after the last line.
    assert chunk.visualize() == '1|line1\n2|line2\n'
def test_create_chunks_raw_string():
    """create_chunks splits text into fixed-size line chunks; the tail may be shorter."""
    text = 'line1\nline2\nline3\nline4\nline5'
    chunks = create_chunks(text, size=2)
    # Five lines at size=2 -> two full chunks plus a one-line remainder.
    assert len(chunks) == 3
    assert chunks[0].text == 'line1\nline2'
    assert chunks[0].line_range == (1, 2)
    assert chunks[1].text == 'line3\nline4'
    assert chunks[1].line_range == (3, 4)
    assert chunks[2].text == 'line5'
    assert chunks[2].line_range == (5, 5)
def test_normalized_lcs():
    """normalized_lcs divides the LCS length by the chunk length (3/6 here)."""
    chunk = 'abcdef'
    edit_draft = 'abcxyz'
    # LCS('abcdef', 'abcxyz') == 'abc' -> 3 / len(chunk) == 0.5
    assert normalized_lcs(chunk, edit_draft) == 0.5
def test_get_top_k_chunk_matches():
    """Matches come back sorted by normalized LCS score, best first."""
    text = 'chunk1\nchunk2\nchunk3\nchunk4'
    query = 'chunk2'
    matches = get_top_k_chunk_matches(text, query, k=2, max_chunk_size=1)
    assert len(matches) == 2
    # Exact match scores a perfect 1.0 and is ranked first.
    assert matches[0].text == 'chunk2'
    assert matches[0].line_range == (2, 2)
    assert matches[0].normalized_lcs == 1.0
    # 'chunk1' shares 5 of its 6 characters with the query.
    assert matches[1].text == 'chunk1'
    assert matches[1].line_range == (1, 1)
    assert matches[1].normalized_lcs == 5 / 6
    assert matches[0].normalized_lcs > matches[1].normalized_lcs
def test_create_chunks_with_empty_lines():
    """Empty lines count toward chunk size and are preserved in chunk text."""
    # Six lines total, three of them blank (lines 2, 4 and 5) — the blank
    # lines are what the line_range assertions below depend on.
    text = 'line1\n\nline3\n\n\nline6'
    chunks = create_chunks(text, size=2)
    assert len(chunks) == 3
    assert chunks[0].text == 'line1\n'
    assert chunks[0].line_range == (1, 2)
    assert chunks[1].text == 'line3\n'
    assert chunks[1].line_range == (3, 4)
    assert chunks[2].text == '\nline6'
    assert chunks[2].line_range == (5, 6)
def test_create_chunks_with_large_size():
    """A size larger than the line count yields a single chunk with all the text."""
    text = 'line1\nline2\nline3'
    chunks = create_chunks(text, size=10)
    assert len(chunks) == 1
    assert chunks[0].text == text
    assert chunks[0].line_range == (1, 3)
def test_create_chunks_with_last_chunk_smaller():
    """When the line count is not a multiple of size, the final chunk is short."""
    text = 'line1\nline2\nline3'
    chunks = create_chunks(text, size=2)
    assert len(chunks) == 2
    assert chunks[0].text == 'line1\nline2'
    assert chunks[0].line_range == (1, 2)
    # The remainder chunk carries only the single leftover line.
    assert chunks[1].text == 'line3'
    assert chunks[1].line_range == (3, 3)
def test_normalized_lcs_edge_cases():
    """Empty inputs score 0.0 instead of raising (no division-by-zero)."""
    assert normalized_lcs('', '') == 0.0
    assert normalized_lcs('a', '') == 0.0
    assert normalized_lcs('', 'a') == 0.0
    # LCS('abcde', 'ace') == 'ace' -> 3 / 5
    assert normalized_lcs('abcde', 'ace') == 0.6
def test_get_top_k_chunk_matches_with_ties():
    """Tied scores still produce k distinct chunks (duplicates collapse)."""
    text = 'chunk1\nchunk2\nchunk3\nchunk1'
    query = 'chunk'
    matches = get_top_k_chunk_matches(text, query, k=3, max_chunk_size=1)
    assert len(matches) == 3
    # Every candidate shares the 5-char prefix 'chunk' out of 6 chars.
    assert all(match.normalized_lcs == 5 / 6 for match in matches)
    # The repeated 'chunk1' line must not crowd out the other candidates.
    assert {match.text for match in matches} == {'chunk1', 'chunk2', 'chunk3'}
def test_get_top_k_chunk_matches_with_large_k():
    """Asking for more matches than there are chunks returns them all, no padding."""
    text = 'chunk1\nchunk2\nchunk3'
    query = 'chunk'
    matches = get_top_k_chunk_matches(text, query, k=10, max_chunk_size=1)
    assert len(matches) == 3  # Should return all chunks even if k is larger
@pytest.mark.parametrize('chunk_size', [1, 2, 3, 4])
def test_create_chunks_different_sizes(chunk_size):
    """Chunk count follows ceil(lines / size) and no line is lost or duplicated."""
    text = 'line1\nline2\nline3\nline4'
    chunks = create_chunks(text, size=chunk_size)
    # Ceiling division: 4 lines split into chunks of `chunk_size`.
    assert len(chunks) == (4 + chunk_size - 1) // chunk_size
    # Summing per-chunk line counts must reproduce the original 4 lines.
    assert sum(len(chunk.text.split('\n')) for chunk in chunks) == 4
def test_chunk_visualization_with_special_characters():
    """visualize() passes tabs and carriage returns through untouched."""
    chunk = Chunk(text='line1\nline2\t\nline3\r', line_range=(1, 3))
    assert chunk.visualize() == '1|line1\n2|line2\t\n3|line3\r\n'
def test_normalized_lcs_with_unicode():
    """Non-ASCII text yields a partial score strictly between 0 and 1."""
    chunk = 'Hello, 世界!'
    edit_draft = 'Hello, world!'
    # 'Hello, ' and '!' overlap, the CJK characters do not — so 0 < score < 1.
    assert 0 < normalized_lcs(chunk, edit_draft) < 1
def test_get_top_k_chunk_matches_with_overlapping_chunks():
    """A query spanning a chunk boundary scores adjacent chunks equally."""
    text = 'chunk1\nchunk2\nchunk3\nchunk4'
    # The query straddles the boundary between the two size-2 chunks.
    query = 'chunk2\nchunk3'
    matches = get_top_k_chunk_matches(text, query, k=2, max_chunk_size=2)
    assert len(matches) == 2
    assert matches[0].text == 'chunk1\nchunk2'
    assert matches[0].line_range == (1, 2)
    assert matches[1].text == 'chunk3\nchunk4'
    assert matches[1].line_range == (3, 4)
    # Each chunk contains exactly one of the query's two lines.
    assert matches[0].normalized_lcs == matches[1].normalized_lcs