Founded on September 1, 2020, the Core: Leadership, Infrastructure, Futures division of the American Library Association has a mission to cultivate and amplify the collective expertise of library workers in core functions through community building, advocacy, and learning. + In June 2020, the ALA Council voted to approve Core: Leadership, Infrastructure, Futures as a new ALA division beginning September 1, 2020, and to dissolve the Association for Library Collections and Technical Services (ALCTS), the Library Information Technology Association (LITA) and the Library Leadership and Management Association (LLAMA) effective August 31, 2020. The vote to form Core was 163 to 1.(1)
+#{p.inner_html}
" }.join("\n") + accumulator << html + end +end + +to_field 'bioghist_heading_ssm' do |record, accumulator| + # Extract section heading (matches ArcLight's bioghist_heading_ssm pattern) + heading = record.xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:head', EAC_NS).first + accumulator << heading.text if heading +end + +# Full-text search field +to_field 'text' do |record, accumulator| + # Title + name = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS) + accumulator << name.map(&:text).join(' ') if name.any? + + # Bioghist + bioghist = record.xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:p', EAC_NS) + accumulator << bioghist.map(&:text).join(' ') if bioghist.any? +end + +# Related agents (from cpfRelation elements) +to_field 'related_agents_ssim' do |record, accumulator| + relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) + relations.each do |rel| + # Get the related entity href/identifier + href = rel['href'] || rel['xlink:href'] + relation_type = rel['cpfRelationType'] + + if href + # Store as: "uri|type" for easy parsing later + accumulator << "#{href}|#{relation_type}" + elsif relation_entry = rel.xpath('eac:relationEntry', EAC_NS).first + # If no href, at least store the name + name = relation_entry.text + accumulator << "#{name}|#{relation_type}" if name + end + end +end + +# Related agents - just URIs (for simpler queries) +to_field 'related_agent_uris_ssim' do |record, accumulator| + relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) + relations.each do |rel| + href = rel['href'] || rel['xlink:href'] + accumulator << href if href + end +end + +# Relationship types +to_field 'relationship_types_ssim' do |record, accumulator| + relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) + relations.each do |rel| + relation_type = rel['cpfRelationType'] + accumulator << relation_type if relation_type 
&& !accumulator.include?(relation_type) + end +end + +# Agent source URI (from original ArchivesSpace) +to_field 'agent_uri' do |record, accumulator| + # Try to extract from control section or otherRecordId + other_id = record.xpath('//eac:control/eac:otherRecordId[@localType="archivesspace_uri"]', EAC_NS).first + if other_id + accumulator << other_id.text + end +end + +# Timestamp +to_field 'timestamp' do |record, accumulator| + accumulator << Time.now.utc.iso8601 +end + +# Document type marker +to_field 'document_type' do |record, accumulator| + accumulator << 'creator' +end + +# Log successful indexing +each_record do |record, context| + record_id = record.xpath('//eac:control/eac:recordId', EAC_NS).first + if record_id + context.logger.info("Indexed creator: #{record_id.text}") + end +end diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..b04c3c7 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,46 @@ +[pytest] +minversion = 7.0 +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +# Output options +addopts = + -v + --strict-markers + --tb=short + --cov=arcflow + --cov-report=term-missing + --cov-report=html + --cov-report=xml + +# Markers for organizing tests +markers = + unit: Unit tests that don't require external dependencies + integration: Integration tests that may require mocked external services + slow: Tests that take significant time to run + skip_complex: Tests that are intentionally skipped due to code complexity + +# Coverage settings +[coverage:run] +source = arcflow +omit = + */tests/* + */test_* + */__pycache__/* + +[coverage:report] +precision = 2 +show_missing = True +skip_covered = False + +# Exclude lines from coverage +exclude_lines = + pragma: no cover + def __repr__ + raise AssertionError + raise NotImplementedError + if __name__ == .__main__.: + if TYPE_CHECKING: + @abstractmethod diff --git a/requirements.txt b/requirements.txt index 6efbe65..7eb5cbc 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ ArchivesSnake -pyyaml \ No newline at end of file +pyyaml +pytest>=7.0.0 +pytest-cov>=4.0.0 +pytest-mock>=3.10.0 \ No newline at end of file diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..78c9ccc --- /dev/null +++ b/tests/README.md @@ -0,0 +1,54 @@ +# ArcFlow Test Suite + +This directory contains tests for the ArcFlow project. + +## Test Structure + +- `unit/` - Fast unit tests for individual components +- `conftest.py` - Shared test fixtures and configuration + +## Running Tests + +```bash +# Run all tests +pytest + +# Run only unit tests +pytest tests/unit + +# Run with verbose output +pytest -v + +# Run specific test file +pytest tests/unit/test_traject_smoke.py +``` + +## Traject Smoke Tests + +Tests in `tests/unit/test_traject_smoke.py` verify traject configuration without requiring Solr. + +### What They Test +- Ruby syntax validity of traject configs +- Traject can load and parse configs +- XML transformation logic (without indexing) + +### Setup Requirements +- Ruby 3.1+ +- Bundler +- Run `bundle install` to install traject gem + +### Performance +- First run: ~60 seconds (includes gem install) +- Cached runs: ~2 seconds (gems cached) +- Fast enough for CI/agent iteration + +### Skipping +These tests skip gracefully if traject config doesn't exist yet. + +## Writing Tests + +When adding new tests: +- Use pytest fixtures from `conftest.py` +- Keep unit tests fast (< 1 second each) +- Add integration tests to appropriate subdirectories +- Use `pytest.skip()` for tests that require optional dependencies diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 0000000..6976fdb --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,218 @@ +""" +Shared pytest fixtures for arcflow tests. 
+ +Provides common test fixtures including: +- mock_asnake_client: Mock ArchivesSpace client for testing without API calls +- temp_dir: Temporary directory for test file operations +- sample_*: Sample data structures representing ArchivesSpace API responses +""" + +import os +import tempfile +import shutil +import pytest +from unittest.mock import Mock, MagicMock +from datetime import datetime, timezone + + +@pytest.fixture +def temp_dir(): + """Create a temporary directory for test file operations.""" + temp_path = tempfile.mkdtemp() + yield temp_path + # Cleanup after test + if os.path.exists(temp_path): + shutil.rmtree(temp_path) + + +@pytest.fixture +def mock_asnake_client(): + """Create a mock ASnake client for testing.""" + mock_client = Mock() + mock_client.authorize = Mock() + mock_client.config = {'baseurl': 'http://localhost:8089'} + + # Mock the get method to return a mock response + mock_response = Mock() + mock_response.json = Mock(return_value={}) + mock_client.get = Mock(return_value=mock_response) + + return mock_client + + +@pytest.fixture +def sample_repository(): + """Sample repository data from ArchivesSpace API.""" + return { + 'uri': '/repositories/2', + 'name': 'Test Repository', + 'repo_code': 'test_repo', + 'slug': 'test-repository' + } + + +@pytest.fixture +def sample_resource(): + """Sample resource (collection) data from ArchivesSpace API.""" + return { + 'uri': '/repositories/2/resources/123', + 'id': 123, + 'title': 'Test Collection', + 'ead_id': 'test-collection-123', + 'publish': True, + 'linked_agents': [ + { + 'role': 'creator', + 'ref': '/agents/corporate_entities/1' + } + ] + } + + +@pytest.fixture +def sample_agent(): + """Sample agent (creator) data from ArchivesSpace API.""" + return { + 'uri': '/agents/corporate_entities/1', + 'id': 1, + 'title': 'Test Organization', + 'display_name': { + 'sort_name': 'Test Organization' + }, + 'is_user': False, + 'system_generated': False, + 'is_repo_agent': False, + 'linked_agent_roles': 
['creator'], + 'is_linked_to_published_record': True, + 'notes': [] + } + + + +@pytest.fixture +def sample_agent_with_bioghist(): + """Sample agent with biographical/historical note.""" + return { + 'uri': '/agents/people/42', + 'id': 42, + 'title': 'Jane Doe', + 'display_name': { + 'sort_name': 'Doe, Jane' + }, + 'is_user': False, + 'system_generated': False, + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'abc123', + 'subnotes': [ + { + 'content': 'Jane Doe was a pioneering librarian.\nShe worked from 1950 to 1990.' + } + ] + } + ] + } + + +@pytest.fixture +def sample_ead_xml(): + """Sample EAD XML content for testing.""" + return b''' +Jane Doe was a
would become <p> and break XML structure + preserved = bioghist_content + + assert '
' in preserved
+ assert '
' in correctly_preserved + + +@pytest.mark.integration +class TestGetCreatorBioghist: + """Tests for get_creator_bioghist method.""" + + def test_extract_bioghist_basic(self, mock_asnake_client): + """Test basic biographical note extraction.""" + from arcflow.main import ArcFlow + + arcflow = Mock(spec=ArcFlow) + arcflow.client = mock_asnake_client + arcflow.log = logging.getLogger('test') + + # Mock agent response + agent_data = { + 'title': 'John Smith', + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'abc123', + 'subnotes': [ + { + 'content': 'John Smith was a librarian.\nHe worked from 1960-1990.' + } + ] + } + ] + } + mock_asnake_client.get.return_value.json.return_value = agent_data + + resource = { + 'linked_agents': [ + { + 'role': 'creator', + 'ref': '/agents/people/123' + } + ] + } + + result = ArcFlow.get_creator_bioghist(arcflow, resource) + + assert result is not None + assert 'John Smith' in result + assert '
<p>John Smith was a librarian.</p>
' in result + + def test_bioghist_xml_not_escaped(self, mock_asnake_client): + """Test that XML in bioghist content is NOT escaped.""" + from arcflow.main import ArcFlow + + arcflow = Mock(spec=ArcFlow) + arcflow.client = mock_asnake_client + arcflow.log = logging.getLogger('test') + + # Content with legitimate XML markup + agent_data = { + 'title': 'Test Agent', + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'xyz789', + 'subnotes': [ + { + 'content': 'Agent withFounded in
Historical information.
' + + # Combine properly + combined = f'{label_escaped}{content}' + + assert '&amp;' in combined # Escaped label + assert '<p>' in combined # Preserved XML + assert '&lt;p&gt;' not in combined # XML not double-escaped + + +@pytest.mark.integration +class TestBioghlistContentHandling: + """Integration tests for bioghist content handling.""" + + def test_paragraph_wrapping(self): + """Test that content lines are wrapped in
<p> tags.""" + content = "Line 1\nLine 2\nLine 3" + lines = [line.strip() for line in content.split('\n') if line.strip()] + + paragraphs = [f'<p>{line}</p>' for line in lines] + result = '\n'.join(paragraphs) + + assert result == '<p>Line 1</p>\n<p>Line 2</p>\n<p>Line 3</p>
' + + def test_bioghist_element_structure(self): + """Test complete bioghist element structure.""" + persistent_id = 'abc123' + agent_name = 'Test Agent' + content_paragraphs = 'Historical note.
' + + # Should include id attribute with aspace_ prefix + bioghist_el = f'Historical note.
' in bioghist_el + + def test_missing_persistent_id_handling(self): + """Test bioghist without persistent_id (shouldn't have id attribute).""" + agent_name = 'Test Agent' + content = 'Content
' + + # Without persistent_id, no id attribute + bioghist_el = f'