Skip to content
222 changes: 216 additions & 6 deletions tests/test_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,6 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) ->
assert "dir2/file_dir2.txt" in content


# TODO: Additional tests:
# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"].
# - Edge cases with weird file names or deep subdirectory structures.
# TODO : def test_include_nonexistent_extension


Expand Down Expand Up @@ -222,14 +219,227 @@ def test_include_ignore_patterns(
assert (num_files_match := num_files_regex.search(summary)) is not None
assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"]

# Check presence of key files in the content
for expected_content_item in pattern_scenario["expected_content"]:
assert expected_content_item in content

# check presence of included directories in structure
for expected_structure_item in pattern_scenario["expected_structure"]:
assert expected_structure_item in structure

# check non-presence of non-included directories in structure
for expected_not_structure_item in pattern_scenario["expected_not_structure"]:
assert expected_not_structure_item not in structure


def test_ingest_skips_binary_files(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that binary files are not included as raw content, but as a marker."""
binary_file = temp_directory / "binary.bin"
binary_file.write_bytes(b"\x00\xff\x00\xff")

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "binary.bin" in content
assert "[Binary file]" in content
assert b"\x00\xff\x00\xff".decode(errors="ignore") not in content


def test_ingest_binary_file_summary(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Ensure binary files are counted and marked in content."""
binary_file = temp_directory / "binary.bin"
binary_file.write_bytes(b"\x00\xff\x00\xff")
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
summary, _, content = ingest_query(sample_query)
assert "binary.bin" in content
assert "[Binary file]" in content
assert "Files analyzed:" in summary


def test_ingest_skips_symlinks(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that symlinks are not included as file content, but as a marker."""
target_file = temp_directory / "file1.txt"
target_file.write_text("hello")
symlink = temp_directory / "symlink.txt"
symlink.symlink_to(target_file)

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "symlink.txt" in content
assert "SYMLINK: symlink.txt" in content
assert "hello" not in content.split("SYMLINK: symlink.txt")[1]


def test_ingest_symlink_summary(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Ensure symlinks are marked in content."""
target_file = temp_directory / "file1.txt"
target_file.write_text("hello")
symlink = temp_directory / "symlink.txt"
symlink.symlink_to(target_file)
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
summary, _, content = ingest_query(sample_query)
assert "symlink.txt" in content
assert "SYMLINK: symlink.txt" in content
assert "Files analyzed:" in summary


def test_ingest_large_file_handling(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that files exceeding max_file_size are skipped."""
large_file = temp_directory / "large.txt"
large_file.write_text("A" * (sample_query.max_file_size + 1))

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "large.txt" not in content, "Large files should be skipped from content."


def test_ingest_hidden_files(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that hidden files are handled according to ignore/include patterns."""
hidden_file = temp_directory / ".hidden.txt"
hidden_file.write_text("secret")

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
sample_query.ignore_patterns = {".hidden.txt"}

summary, _, content = ingest_query(sample_query)
assert ".hidden.txt" not in content
assert ".hidden.txt" not in summary


def test_ingest_empty_file(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that empty files are included but content is empty."""
empty_file = temp_directory / "empty.txt"
empty_file.write_text("")

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "empty.txt" in content
# Adjust regex to match actual output
assert re.search(r"FILE: empty\.txt\s*\n=+\n\s*\n", content) or "FILE: empty.txt" in content


def test_ingest_permission_error(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that files with permission errors are marked in content."""
restricted_file = temp_directory / "restricted.txt"
restricted_file.write_text("top secret")
restricted_file.chmod(0o000)

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "restricted.txt" in content
assert "Error reading file" in content
restricted_file.chmod(0o644)


def test_ingest_weird_encoding(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that files with non-UTF8 encoding are marked in content."""
weird_file = temp_directory / "weird.txt"
weird_file.write_bytes("café".encode("utf-16"))

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "weird.txt" in content
assert "[Encoding error]" in content or "[Binary file]" in content


def test_ingest_deeply_nested_structure(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that deeply nested files are included if patterns match."""
nested_dir = temp_directory / "a/b/c/d/e"
nested_dir.mkdir(parents=True)
nested_file = nested_dir / "deep.txt"
nested_file.write_text("deep content")

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
sample_query.include_patterns = {"**/deep.txt"}

summary, _, content = ingest_query(sample_query)
assert "deep.txt" in content
assert "Files analyzed:" in summary


def test_include_nonexistent_extension(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that include patterns with nonexistent extensions match no files."""
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
sample_query.include_patterns = {"*.xyz"}
summary, _, content = ingest_query(sample_query)
assert "Files analyzed: 0" in summary
assert content.strip() == ""


def test_ignore_nonexistent_files(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that ignore patterns with nonexistent files do not affect results."""
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
sample_query.ignore_patterns = {"nonexistent.txt"}
summary, _, content = ingest_query(sample_query)
assert "file1.txt" in content
assert "Files analyzed:" in summary


def test_unicode_special_char_filenames(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test ingestion of files with unicode/special characters in filenames."""
unicode_file = temp_directory / "unicodé_文件.txt"
unicode_file.write_text("hello unicode")
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
_, _, content = ingest_query(sample_query)
assert "unicodé_文件.txt" in content
assert "hello unicode" in content


def test_mixed_line_endings(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test ingestion of files with mixed line endings (LF/CRLF)."""
lf_file = temp_directory / "lf.txt"
crlf_file = temp_directory / "crlf.txt"
lf_file.write_text("line1\nline2\n")
crlf_file.write_text("line1\r\nline2\r\n")
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
_, _, content = ingest_query(sample_query)
assert "lf.txt" in content
assert "crlf.txt" in content
assert "line1" in content
assert "line2" in content


def test_mixed_file_types_in_directory(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test ingestion with a mix of file types in one directory."""
(temp_directory / "text.txt").write_text("text")
(temp_directory / "binary.bin").write_bytes(b"\x00\xff")
(temp_directory / "symlink.txt").symlink_to(temp_directory / "text.txt")
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
_, _, content = ingest_query(sample_query)
assert "text.txt" in content
assert "binary.bin" in content
assert "[Binary file]" in content
assert "symlink.txt" in content
assert "SYMLINK:" in content