diff --git a/docs/tutorials/using_cli.md b/docs/tutorials/using_cli.md index 523c28e..91acee8 100644 --- a/docs/tutorials/using_cli.md +++ b/docs/tutorials/using_cli.md @@ -11,7 +11,6 @@ The Parxy CLI lets you: | Command | Description | |------------------|-------------------------------------------------------------------------------------------------------------| | `parxy parse` | Extract text content from documents with customizable detail levels and output formats. Process files or folders with multiple drivers. | -| `parxy preview` | Interactive document viewer with metadata, table of contents, and scrollable content preview | | `parxy markdown` | Convert documents to Markdown files, with support for multiple drivers and folder processing | | `parxy pdf:merge`| Merge multiple PDF files into one, with support for page ranges | | `parxy pdf:split`| Split a PDF into individual pages, with optional page range and single-file extraction | @@ -133,47 +132,6 @@ Process all PDFs in a folder with two drivers, output as JSON, and save to a spe parxy parse /path/to/pdfs -d pymupdf -d llamaparse -m json -o output/ ``` -## Previewing Documents - -The `preview` command provides an interactive document viewer that displays: -- Document metadata (title, author, creation date, etc.) -- Table of contents extracted from headings -- Document content rendered as markdown - -This is useful for quickly inspecting a document's structure and content without creating output files. - -### Basic Usage - -```bash -parxy preview document.pdf -``` - -The preview is displayed in a scrollable three-panel layout. - -### Options - -Specify a driver: - -```bash -parxy preview document.pdf --driver llamaparse -``` - -Adjust extraction level: - -```bash -parxy preview document.pdf --level line -``` - -### Navigation - -The preview uses your system's default pager (similar to `less` on Unix systems), allowing you to: -- Scroll up and down -- Search for text -- Exit the preview - -This is ideal for quick document inspection before running a full parsing operation. - - ## Converting to Markdown The `markdown` command converts documents to Markdown format, preserving structure such as headings and lists. It follows the same conventions as the `parse` command: output files are prefixed with the driver name and saved next to the source file by default. @@ -407,7 +365,6 @@ With the CLI, you can use Parxy as a **standalone document parsing tool** — id | Command | Purpose | |------------------|--------------------------------------------------------------| | `parxy parse` | Extract text from documents with multiple formats & drivers | -| `parxy preview` | Interactive document viewer with metadata and TOC | | `parxy markdown` | Generate Markdown files; accepts JSON results and supports `--page-separators` | | `parxy pdf:merge`| Merge multiple PDF files with page range support | | `parxy pdf:split`| Split PDF into individual pages; supports `--pages` and `--combine` | diff --git a/src/parxy_cli/cli.py b/src/parxy_cli/cli.py index 9db4ac5..3f33df0 100644 --- a/src/parxy_cli/cli.py +++ b/src/parxy_cli/cli.py @@ -11,7 +11,6 @@ from parxy_cli.console.console import Console from parxy_cli.commands.docker import app as docker_command from parxy_cli.commands.parse import app as parse_command -from parxy_cli.commands.preview import app as preview_command from parxy_cli.commands.drivers import app as drivers_command from parxy_cli.commands.env import app as env_command from parxy_cli.commands.version import app as version_command @@ -70,7 +69,6 @@ def main( app.add_typer(docker_command) app.add_typer(parse_command) -app.add_typer(preview_command) app.add_typer(drivers_command) app.add_typer(env_command) app.add_typer(version_command) diff --git a/src/parxy_cli/commands/preview.py b/src/parxy_cli/commands/preview.py deleted file mode 100644 index 1ed311b..0000000 --- a/src/parxy_cli/commands/preview.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Preview command for interactive document viewing.""" - -from typing import Optional, Annotated, List - -import typer -from rich.layout import Layout -from rich.text import Text -from rich.panel import Panel -from rich.markdown import Markdown - -from parxy_core.facade import Parxy -from parxy_core.models.models import Document, TextBlock, Metadata - -from parxy_cli.models import Level -from parxy_cli.console.console import Console - -app = typer.Typer() - -console = Console() - - -def extract_toc(doc: Document) -> List[dict]: - """ - Extract table of contents from document headings. - - Args: - doc: The parsed document - - Returns: - List of dicts with keys: 'text', 'level', 'page' - """ - toc = [] - for page in doc.pages: - if page.blocks: - for block in page.blocks: - if isinstance(block, TextBlock) and block.category in [ - 'heading', - 'title', - 'header', - ]: - toc.append( - { - 'text': block.text.strip(), - 'level': block.level or 1, - 'page': page.number, - } - ) - return toc - - -def format_metadata(metadata: Metadata, page_count: int) -> str: - """ - Format document metadata for display. - - Args: - metadata: Document metadata object - page_count: Number of pages in document - - Returns: - Formatted metadata string - """ - - lines = [] - - if metadata is not None and metadata.title: - lines.append(f'[bold]Title:[/bold] {metadata.title}') - if metadata is not None and metadata.author: - lines.append(f'[bold]Author:[/bold] {metadata.author}') - if metadata is not None and metadata.subject: - lines.append(f'[bold]Subject:[/bold] {metadata.subject}') - if metadata is not None and metadata.keywords: - lines.append(f'[bold]Keywords:[/bold] {metadata.keywords}') - if metadata is not None and metadata.creator: - lines.append(f'[bold]Creator:[/bold] {metadata.creator}') - if metadata is not None and metadata.producer: - lines.append(f'[bold]Producer:[/bold] {metadata.producer}') - if metadata is not None and metadata.created_at: - lines.append(f'[bold]Created:[/bold] {metadata.created_at}') - if metadata is not None and metadata.updated_at: - lines.append(f'[bold]Updated:[/bold] {metadata.updated_at}') - - lines.append(f'[bold]Pages:[/bold] {page_count}') - - return '\n'.join(lines) if lines else '[dim]No metadata available[/dim]' - - -def render_viewer_mode(doc: Document) -> Layout: - """ - Render document in viewer mode with three panels: metadata, TOC, and content. - - Args: - doc: The parsed document - - Returns: - Layout object with three-panel view - """ - # Extract TOC - toc = extract_toc(doc) - - # Format metadata - metadata_text = format_metadata(doc.metadata, len(doc.pages)) - - # Format TOC - if toc: - toc_lines = [] - for item in toc: - indent = ' ' * (item['level'] - 1) - toc_lines.append(f'{indent}• {item["text"]} [dim](p.{item["page"]})[/dim]') - toc_text = '\n'.join(toc_lines) - else: - toc_text = '[dim]No headings found[/dim]' - - # Create markdown content - markdown_content = doc.markdown() - - # Create three panels - metadata_panel = Panel( - Text.from_markup(metadata_text), - padding=(0, 1), - ) - - toc_panel = Panel( - Text.from_markup(toc_text), - title='Contents', - border_style='blue', - title_align='left', - padding=(1, 2), - ) - - content_panel = Panel( - Markdown(markdown_content, code_theme='monokai'), - title='Document preview in markdown', - title_align='left', - padding=(1, 2), - ) - - # Use Layout to create three-column view - layout = Layout() - - layout.split_column( - Layout(metadata_panel, name='header', minimum_size=3), - Layout(name='body', ratio=2), - ) - - layout['body'].split_row( - Layout(toc_panel, name='toc', ratio=1), - Layout(content_panel, name='content', ratio=2), - ) - - return layout - - -@app.command() -def preview( - file: Annotated[ - str, - typer.Argument( - help='File to preview', - exists=True, - file_okay=True, - dir_okay=False, - readable=True, - ), - ], - driver: Annotated[ - Optional[str], - typer.Option( - '--driver', - '-d', - help='Driver to use for parsing (default: pymupdf or PARXY_DEFAULT_DRIVER)', - ), - ] = None, - level: Annotated[ - Level, - typer.Option( - '--level', - '-l', - help='Extraction level', - ), - ] = Level.BLOCK, - env_file: Annotated[ - str, - typer.Option( - '--env', - '-e', - help='Path to .env file with configuration', - exists=True, - file_okay=True, - dir_okay=False, - readable=True, - ), - ] = '.env', -): - """ - Preview a document in an interactive viewer with metadata, table of contents, and content. - - The preview is displayed in a scrollable three-panel layout showing: - - Document metadata (title, author, pages, etc.) - - Table of contents extracted from headings - - Document content rendered as markdown - """ - - with console.shimmer(f'Processing {file} using {driver}...'): - # Parse the document - doc = Parxy.parse( - file=file, - level=level.value, - driver_name=driver, - ) - - # Render the viewer layout - layout = render_viewer_mode(doc) - - # Use Rich's pager for scrollable output with styles preserved - console.print(layout) - # with console.pager(styles=True): diff --git a/tests/commands/test_preview.py b/tests/commands/test_preview.py deleted file mode 100644 index 06a92af..0000000 --- a/tests/commands/test_preview.py +++ /dev/null @@ -1,328 +0,0 @@ -"""Test suite for the preview command.""" - -from unittest.mock import patch -import pytest -from typer.testing import CliRunner -from click.utils import strip_ansi - -from parxy_cli.commands.preview import ( - app, - extract_toc, - format_metadata, - render_viewer_mode, -) -from parxy_core.models import Document, Page, Metadata, TextBlock, BoundingBox - - -@pytest.fixture -def runner(): - """Fixture providing a CLI runner.""" - return CliRunner() - - -@pytest.fixture -def mock_document_simple(): - """Fixture providing a simple mock document with one page.""" - return Document( - pages=[Page(number=1, width=612.0, height=792.0, text='Test content')] - ) - - -@pytest.fixture -def mock_document_with_metadata(): - """Fixture providing a mock document with metadata.""" - metadata = Metadata( - title='Test Document', - author='Test Author', - subject='Test Subject', - keywords='test, document', - creator='Test Creator', - producer='Test Producer', - created_at='2024-01-01T00:00:00', - updated_at='2024-01-02T00:00:00', - ) - return Document( - pages=[Page(number=1, width=612.0, height=792.0, text='Test content')], - metadata=metadata, - ) - - -@pytest.fixture -def mock_document_with_headings(): - """Fixture providing a mock document with headings for TOC.""" - heading_block = TextBlock( - type='text', - text='Chapter 1: Introduction', - category='heading', - level=1, - page=1, - bbox=BoundingBox(x0=0, y0=0, x1=100, y1=20), - ) - - text_block = TextBlock( - type='text', - text='This is some content.', - category='text', - page=1, - bbox=BoundingBox(x0=0, y0=25, x1=100, y1=40), - ) - - page = Page( - number=1, - width=612.0, - height=792.0, - text='Chapter 1: Introduction\nThis is some content.', - blocks=[heading_block, text_block], - ) - - return Document(pages=[page]) - - -def test_preview_command_calls_facade_correctly(runner, mock_document_simple, tmp_path): - """Test that the preview command correctly invokes the Parxy facade.""" - - # Create a test PDF file - test_file = tmp_path / 'test.pdf' - test_file.write_text('dummy pdf content') - - with patch('parxy_cli.commands.preview.Parxy') as mock_parxy: - # Setup the mock to return our test document - mock_parxy.parse.return_value = mock_document_simple - - # Run the command with a test file - result = runner.invoke(app, [str(test_file)]) - - # Assert the command executed successfully - assert result.exit_code == 0 - - # Assert Parxy.parse was called with the correct arguments - mock_parxy.parse.assert_called_once_with( - file=str(test_file), - level='block', # default level - driver_name=None, # default driver - ) - - -def test_preview_command_with_custom_driver(runner, mock_document_simple, tmp_path): - """Test that the preview command correctly handles custom driver.""" - - # Create a test PDF file - test_file = tmp_path / 'test.pdf' - test_file.write_text('dummy pdf content') - - with patch('parxy_cli.commands.preview.Parxy') as mock_parxy: - mock_parxy.parse.return_value = mock_document_simple - - # Run command with custom driver - result = runner.invoke(app, [str(test_file), '--driver', 'llamaparse']) - - assert result.exit_code == 0 - - # Assert Parxy.parse was called with custom driver - mock_parxy.parse.assert_called_once_with( - file=str(test_file), level='block', driver_name='llamaparse' - ) - - -def test_preview_command_with_custom_level(runner, mock_document_simple, tmp_path): - """Test that the preview command correctly handles custom extraction level.""" - - # Create a test PDF file - test_file = tmp_path / 'test.pdf' - test_file.write_text('dummy pdf content') - - with patch('parxy_cli.commands.preview.Parxy') as mock_parxy: - mock_parxy.parse.return_value = mock_document_simple - - # Run command with custom level - result = runner.invoke(app, [str(test_file), '--level', 'page']) - - assert result.exit_code == 0 - - # Assert Parxy.parse was called with custom level - mock_parxy.parse.assert_called_once_with( - file=str(test_file), level='page', driver_name=None - ) - - -def test_extract_toc_with_headings(mock_document_with_headings): - """Test that extract_toc correctly extracts headings from document.""" - - toc = extract_toc(mock_document_with_headings) - - assert len(toc) == 1 - assert toc[0]['text'] == 'Chapter 1: Introduction' - assert toc[0]['level'] == 1 - assert toc[0]['page'] == 1 - - -def test_extract_toc_without_headings(mock_document_simple): - """Test that extract_toc returns empty list when no headings found.""" - - toc = extract_toc(mock_document_simple) - - assert len(toc) == 0 - - -def test_format_metadata_with_full_metadata(mock_document_with_metadata): - """Test that format_metadata correctly formats all metadata fields.""" - - formatted = format_metadata(mock_document_with_metadata.metadata, 1) - - assert 'Title:' in formatted - assert 'Test Document' in formatted - assert 'Author:' in formatted - assert 'Test Author' in formatted - assert 'Subject:' in formatted - assert 'Keywords:' in formatted - assert 'Creator:' in formatted - assert 'Producer:' in formatted - assert 'Created:' in formatted - assert 'Updated:' in formatted - assert 'Pages:[/bold] 1' in formatted # Includes Rich markup - - -def test_format_metadata_with_no_metadata(): - """Test that format_metadata handles None metadata gracefully.""" - - formatted = format_metadata(None, 5) - - assert 'Pages:[/bold] 5' in formatted # Includes Rich markup - # Should not crash, and should still show page count - - -def test_format_metadata_with_empty_metadata(): - """Test that format_metadata handles empty metadata object.""" - - metadata = Metadata() - formatted = format_metadata(metadata, 3) - - # Should show page count even with empty metadata - assert 'Pages:[/bold] 3' in formatted # Includes Rich markup - - -def test_render_viewer_mode_creates_layout(mock_document_with_metadata): - """Test that render_viewer_mode creates a proper Layout object.""" - - layout = render_viewer_mode(mock_document_with_metadata) - - # Verify we get a Layout object back - from rich.layout import Layout - - assert isinstance(layout, Layout) - - # Verify the layout was created successfully (basic check) - # The layout should have nested children - assert layout is not None - - -def test_preview_command_handles_parsing_errors(runner, tmp_path): - """Test that the preview command properly handles parsing errors.""" - - # Create a test PDF file - test_file = tmp_path / 'test.pdf' - test_file.write_text('dummy pdf content') - - with patch('parxy_cli.commands.preview.Parxy') as mock_parxy: - # Setup the mock to raise an exception - mock_parxy.parse.side_effect = Exception('Test parsing error') - - # Run the command - result = runner.invoke(app, [str(test_file)]) - - # Command should exit with error - assert result.exit_code != 0 - assert isinstance(result.exception, Exception) - - -def test_preview_command_output_contains_content( - runner, mock_document_with_metadata, tmp_path -): - """Test that preview command output includes document content.""" - - # Create a test PDF file - test_file = tmp_path / 'test.pdf' - test_file.write_text('dummy pdf content') - - with patch('parxy_cli.commands.preview.Parxy') as mock_parxy: - mock_parxy.parse.return_value = mock_document_with_metadata - - # Run command - result = runner.invoke(app, [str(test_file)]) - - assert result.exit_code == 0 - - cleaned_output = strip_ansi(result.stdout) - assert 'Test content' in cleaned_output - - -def test_extract_toc_with_multiple_heading_levels(): - """Test that extract_toc handles multiple heading levels correctly.""" - - heading1 = TextBlock( - type='text', - text='Chapter 1', - category='heading', - level=1, - page=1, - bbox=BoundingBox(x0=0, y0=0, x1=100, y1=20), - ) - - heading2 = TextBlock( - type='text', - text='Section 1.1', - category='heading', - level=2, - page=1, - bbox=BoundingBox(x0=0, y0=25, x1=100, y1=40), - ) - - page = Page( - number=1, - width=612.0, - height=792.0, - text='Chapter 1\nSection 1.1', - blocks=[heading1, heading2], - ) - - doc = Document(pages=[page]) - - toc = extract_toc(doc) - - assert len(toc) == 2 - assert toc[0]['text'] == 'Chapter 1' - assert toc[0]['level'] == 1 - assert toc[1]['text'] == 'Section 1.1' - assert toc[1]['level'] == 2 - - -def test_extract_toc_with_title_category(): - """Test that extract_toc includes blocks with 'title' category.""" - - title_block = TextBlock( - type='text', - text='Document Title', - category='title', - level=1, - page=1, - bbox=BoundingBox(x0=0, y0=0, x1=100, y1=20), - ) - - page = Page( - number=1, - width=612.0, - height=792.0, - text='Document Title', - blocks=[title_block], - ) - - doc = Document(pages=[page]) - - toc = extract_toc(doc) - - assert len(toc) == 1 - assert toc[0]['text'] == 'Document Title' - # The TOC dictionary only has text, level, and page keys - assert 'text' in toc[0] - assert 'level' in toc[0] - assert 'page' in toc[0]