RAGStack-Lambda

Overview Schema Related Servers Score Discussions

test_csv_extractor.py•5.87 KiB

"""Unit tests for CSV extractor."""

import pytest

from ragstack_common.text_extractors.base import ExtractionResult
from ragstack_common.text_extractors.csv_extractor import CsvExtractor
from tests.fixtures.text_extractor_samples import (
    CSV_EMPTY,
    CSV_MALFORMED,
    CSV_NO_HEADER,
    CSV_NUMERIC,
    CSV_QUOTED_FIELDS,
    CSV_SEMICOLON,
    CSV_SINGLE_COLUMN,
    CSV_STANDARD,
    CSV_TAB_SEPARATED,
)


def _extract(sample, filename):
    """Encode ``sample`` to bytes and run it through a fresh ``CsvExtractor``.

    Every test below performs the same two steps (build an extractor, call
    ``extract`` with the encoded fixture and a filename), so they are
    gathered here.
    """
    return CsvExtractor().extract(sample.encode(), filename)


class TestCsvExtractor:
    """Behavioural checks for ``CsvExtractor.extract``."""

    def test_extracts_standard_csv(self):
        """A comma-separated file yields a csv result naming its columns."""
        result = _extract(CSV_STANDARD, "data.csv")
        lowered = result.markdown.lower()

        assert isinstance(result, ExtractionResult)
        assert result.file_type == "csv"
        assert "name" in lowered
        assert "age" in lowered

    def test_extracts_tab_separated(self):
        """Tab-delimited input is still reported as csv."""
        result = _extract(CSV_TAB_SEPARATED, "data.tsv")

        assert result.file_type == "csv"
        assert "name" in result.markdown.lower()

    def test_extracts_semicolon_separated(self):
        """Semicolon-delimited input is still reported as csv."""
        result = _extract(CSV_SEMICOLON, "data.csv")

        assert result.file_type == "csv"
        assert "name" in result.markdown.lower()

    def test_handles_csv_no_header(self):
        """Header-less input gets placeholder column names in the output."""
        result = _extract(CSV_NO_HEADER, "data.csv")

        assert result.file_type == "csv"
        # Expected placeholders look like Column1, Column2; a
        # case-insensitive match covers either capitalisation.
        assert "column" in result.markdown.lower()

    def test_handles_quoted_fields(self):
        """Quoted fields that contain the delimiter survive extraction."""
        result = _extract(CSV_QUOTED_FIELDS, "data.csv")

        assert result.file_type == "csv"
        # "Smith, John" is meant to be parsed as one field; only the
        # presence of the surname is asserted here.
        assert "Smith" in result.markdown

    def test_detects_numeric_columns(self):
        """Numeric input produces column information in structural metadata."""
        result = _extract(CSV_NUMERIC, "data.csv")

        # Either key is accepted as evidence of column-level metadata.
        accepted_keys = ("columns", "column_count")
        assert any(key in result.structural_metadata for key in accepted_keys)

    def test_generates_frontmatter(self):
        """The markdown opens with frontmatter naming the source and type."""
        result = _extract(CSV_STANDARD, "sales.csv")
        markdown = result.markdown

        assert markdown.startswith("---\n")
        assert "source_file: sales.csv" in markdown
        assert "file_type: csv" in markdown

    def test_structural_metadata_includes_counts(self):
        """Row and column counts are present and match the standard fixture."""
        result = _extract(CSV_STANDARD, "data.csv")
        metadata = result.structural_metadata

        assert "row_count" in metadata
        assert "column_count" in metadata
        assert metadata["row_count"] == 5  # 5 data rows
        assert metadata["column_count"] == 4  # name, age, city, email

    def test_generates_sample_rows_table(self):
        """Sample rows are rendered with markdown table markers."""
        result = _extract(CSV_STANDARD, "data.csv")

        # Pipe and dash runs are the markers a markdown table relies on.
        assert "|" in result.markdown
        assert "---" in result.markdown

    def test_malformed_csv_falls_back_to_text(self):
        """Malformed input still produces an ``ExtractionResult``."""
        result = _extract(CSV_MALFORMED, "broken.csv")

        # Only the result type is pinned: the extractor may attach a parse
        # warning, and the file type might be 'csv' (with a warning) or 'txt'.
        assert isinstance(result, ExtractionResult)

    def test_single_column_csv(self):
        """A one-column file is accepted as valid input."""
        result = _extract(CSV_SINGLE_COLUMN, "single.csv")

        assert isinstance(result, ExtractionResult)

    def test_empty_csv(self):
        """An empty file still yields an ``ExtractionResult``."""
        result = _extract(CSV_EMPTY, "empty.csv")

        # Minimal output is expected; its content is not asserted.
        assert isinstance(result, ExtractionResult)

    def test_columns_listed_in_output(self):
        """Column names from the header appear in the markdown."""
        result = _extract(CSV_STANDARD, "data.csv")
        lowered = result.markdown.lower()

        assert "name" in lowered
        assert "email" in lowered

    def test_delimiter_detected_in_metadata(self):
        """The detected delimiter is recorded under the ``delimiter`` key."""
        result = _extract(CSV_TAB_SEPARATED, "data.tsv")
        metadata = result.structural_metadata

        assert "delimiter" in metadata
        assert metadata["delimiter"] == "\t"

    def test_title_extracted_from_filename(self):
        """The title is the filename without its extension."""
        result = _extract(CSV_STANDARD, "sales_data_2024.csv")

        assert result.title == "sales_data_2024"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/HatmanStack/RAGStack-Lambda'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_csv_extractor.py•5.87 KiB