From f248b9dc61a0739e0a506f27fbb0d93762ed0e64 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 13:45:04 +0000
Subject: [PATCH 01/15] =?UTF-8?q?fix:=20add=20venv=20and=20local=5Ftesting?=
=?UTF-8?q?=20to=20.gitignore=20Rapid=20local=20performance=20test=20envir?=
=?UTF-8?q?onment=20supporting=20the=20Polars=E2=80=91based=20transformati?=
=?UTF-8?q?on=20rewrite=20in=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 2 ++
1 file changed, 2 insertions(+)
diff --git a/.gitignore b/.gitignore
index a08611ff..e631b4d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,8 @@ demodata/
.eggs
*.gfs
.venv
+/venv
+/local_testing
.direnv
var/cache
/collection
From e58d438d0acf552472e3e1d2e299db39269a9764 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:11:32 +0000
Subject: [PATCH 02/15] =?UTF-8?q?feat:=20add=20command-line=20interface=20?=
=?UTF-8?q?for=20title-boundary=20pipeline=20with=20argument=20parsing=20R?=
=?UTF-8?q?apid=20local=20performance=20test=20environment=20supporting=20?=
=?UTF-8?q?the=20Polars=E2=80=91based=20transformation=20rewrite=20in=20di?=
=?UTF-8?q?gital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 15 ++++-
local_testing/cli.py | 143 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 156 insertions(+), 2 deletions(-)
create mode 100644 local_testing/cli.py
diff --git a/.gitignore b/.gitignore
index e631b4d7..0d82846f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,6 @@ demodata/
*.gfs
.venv
/venv
-/local_testing
.direnv
var/cache
/collection
@@ -37,4 +36,16 @@ docs/modules.rst
# don't store data folder for use as storage for notebooks
notebooks/data/
-notebooks/.ipynb_checkpoints
\ No newline at end of file
+notebooks/.ipynb_checkpoints
+
+# local_testing
+/local_testing/cache/
+/local_testing/converted/
+/local_testing/extracted/
+/local_testing/output/
+/local_testing/pipeline/
+/local_testing/polars_phases/
+/local_testing/raw/
+/local_testing/reports/
+/local_testing/specification/
+/local_testing/venv/
diff --git a/local_testing/cli.py b/local_testing/cli.py
new file mode 100644
index 00000000..4ab74687
--- /dev/null
+++ b/local_testing/cli.py
@@ -0,0 +1,143 @@
+"""
+Command-line interface for title-boundary pipeline.
+
+Handles argument parsing and provides user-facing CLI functions.
+"""
+
+import argparse
+from typing import List, Dict
+
+from file_downloader import FileDownloader
+
+
+class CLI:
+ """Command-line interface manager."""
+
+ ENDPOINT_CSV_URL = "https://raw.githubusercontent.com/digital-land/config/main/collection/title-boundary/endpoint.csv"
+
+ @staticmethod
+ def create_parser() -> argparse.ArgumentParser:
+ """
+ Create argument parser for CLI.
+
+ Returns:
+ Configured ArgumentParser instance
+ """
+ parser = argparse.ArgumentParser(
+ description="Title Boundary Pipeline - Download, Convert, and Transform",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ python main.py # List available LAs
+ python main.py --la "Buckinghamshire" # Process Buckinghamshire
+ python main.py --la "Buckinghamshire" --limit 100 # Limit to 100 records
+ python main.py --use-duckdb --use-parquet # Best performance
+ """,
+ )
+
+ parser.add_argument(
+ "--la", type=str, help="Local Authority name (partial match)"
+ )
+ parser.add_argument(
+ "--limit", type=int, help="Limit number of records to process"
+ )
+ parser.add_argument(
+ "--skip-download",
+ action="store_true",
+ help="Skip download, use existing data",
+ )
+ parser.add_argument(
+ "--list", action="store_true", help="List available Local Authorities"
+ )
+ parser.add_argument(
+ "--use-duckdb",
+ action="store_true",
+ help="Use DuckDB for GML conversion (faster)",
+ )
+ parser.add_argument(
+ "--use-parquet", action="store_true", help="Output Parquet instead of CSV"
+ )
+ parser.add_argument(
+ "--compare",
+ action="store_true",
+ help="Run both original and Polars pipelines for performance comparison",
+ )
+
+ return parser
+
+ @classmethod
+ def fetch_endpoint_list(cls) -> List[Dict]:
+ """
+ Fetch list of available endpoints from Land Registry API.
+
+ Returns:
+ List of endpoint dictionaries
+ """
+ return FileDownloader().fetch_endpoint_list()
+
+ @staticmethod
+ def get_la_name_from_url(url: str) -> str:
+ """
+ Extract Local Authority name from endpoint URL.
+
+ Args:
+ url: Endpoint URL
+
+ Returns:
+ Formatted LA name
+ """
+ return FileDownloader.get_la_name_from_url(url)
+
+ @classmethod
+ def list_available_las(cls):
+ """List all available Local Authorities to console."""
+ endpoints = cls.fetch_endpoint_list()
+
+ print(f"\n{'='*60}")
+ print("Available Local Authorities")
+ print(f"{'='*60}\n")
+
+ for i, ep in enumerate(endpoints, 1):
+ name = ep.get("local_authority", "Unknown")
+ print(f" {i:3d}. {name}")
+
+ print(f"\n{'='*60}")
+ print(f"Total: {len(endpoints)} Local Authorities")
+ print(f"{'='*60}\n")
+
+ return endpoints
+
+ @classmethod
+ def find_matching_la(cls, search_term: str) -> tuple:
+ """
+ Find Local Authority matching search term.
+
+ Args:
+ search_term: Partial LA name to search for
+
+ Returns:
+ Tuple of (matching_endpoint, la_name) or (None, None) if no match/multiple matches
+ """
+ endpoints = cls.fetch_endpoint_list()
+ matching = [
+ ep
+ for ep in endpoints
+ if search_term.lower() in ep.get("local_authority", "").lower()
+ ]
+
+ if not matching:
+ print(f"Error: No Local Authority matching '{search_term}'")
+ print("Use --list to see available options")
+ return None, None
+
+ if len(matching) > 1:
+ print(f"Multiple matches for '{search_term}':")
+ for ep in matching:
+ print(f" - {ep.get('local_authority', 'Unknown')}")
+ print("Please be more specific")
+ return None, None
+
+ endpoint = matching[0]
+ la_name = endpoint.get("local_authority", "Unknown")
+
+ return endpoint, la_name
From c3f82f4bac7fdf82a61ad49228fad268476b304b Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:11:51 +0000
Subject: [PATCH 03/15] =?UTF-8?q?feat:=20implement=20file=20downloader=20f?=
=?UTF-8?q?or=20title-boundary=20GML=20files=20with=20progress=20tracking?=
=?UTF-8?q?=20Rapid=20local=20performance=20test=20environment=20supportin?=
=?UTF-8?q?g=20the=20Polars=E2=80=91based=20transformation=20rewrite=20in?=
=?UTF-8?q?=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/file_downloader.py | 177 +++++++++++++++++++++++++++++++
1 file changed, 177 insertions(+)
create mode 100644 local_testing/file_downloader.py
diff --git a/local_testing/file_downloader.py b/local_testing/file_downloader.py
new file mode 100644
index 00000000..e13fc40b
--- /dev/null
+++ b/local_testing/file_downloader.py
@@ -0,0 +1,177 @@
+"""
+File downloader for title-boundary GML files.
+
+Handles fetching endpoint lists from GitHub config repository
+and downloading ZIP files with progress tracking.
+"""
+
+import csv
+import urllib.request
+from pathlib import Path
+from typing import List, Optional
+
+try:
+ import requests
+ HAS_REQUESTS = True
+except ImportError:
+ HAS_REQUESTS = False
+
+
+class FileDownloader:
+ """Handles downloading title-boundary files from endpoint CSV."""
+
+ ENDPOINT_CSV_URL = "https://raw.githubusercontent.com/digital-land/config/main/collection/title-boundary/endpoint.csv"
+
+ def __init__(self, endpoint_csv_url: Optional[str] = None):
+ """Initialize downloader with optional custom endpoint CSV URL."""
+ self.endpoint_csv_url = endpoint_csv_url or self.ENDPOINT_CSV_URL
+
+ def fetch_endpoint_list(self) -> List[dict]:
+ """Fetch list of available title boundary datasets from GitHub CSV."""
+ print(f" Fetching endpoint list from {self.endpoint_csv_url}...")
+
+ req = urllib.request.Request(
+ self.endpoint_csv_url,
+ headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
+ )
+
+ with urllib.request.urlopen(req) as response:
+ content = response.read().decode('utf-8')
+ reader = csv.DictReader(content.splitlines())
+
+ endpoints = []
+ for row in reader:
+ url = row.get('endpoint-url', '').strip()
+ if url:
+ endpoints.append({
+ 'endpoint': row.get('endpoint', ''),
+ 'url': url,
+ 'local_authority': self.get_la_name_from_url(url),
+ 'entry_date': row.get('entry-date', ''),
+ })
+
+ print(f" Found {len(endpoints)} endpoints")
+ return endpoints
+
+ @staticmethod
+ def get_la_name_from_url(url: str) -> str:
+ """Extract Local Authority name from download URL."""
+ # URL format: .../download/Buckinghamshire_Council.zip
+ parts = url.split("/")
+ if parts:
+ filename = parts[-1].replace(".zip", "").replace("_", " ")
+ # Remove common suffixes for cleaner names
+ for suffix in [" Council", " Borough Council", " City Council", " District Council",
+ " Metropolitan Borough Council", " County Council"]:
+ if filename.endswith(suffix):
+ filename = filename[:-len(suffix)]
+ break
+ # Remove prefixes
+ for prefix in ["Borough of ", "City of ", "County of ", "Royal Borough of ",
+ "London Borough of "]:
+ if filename.startswith(prefix):
+ filename = filename[len(prefix):]
+ break
+ return filename.strip()
+ return "Unknown"
+
+ def download_file(
+ self, url: str, output_path: Path, chunk_size: int = 8192
+ ) -> Path:
+ """
+ Download file from URL to output path with progress tracking.
+
+ Args:
+ url: URL to download from
+ output_path: Path where file should be saved
+ chunk_size: Size of download chunks in bytes
+
+ Returns:
+ Path to downloaded file
+ """
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+
+ print(f" Downloading from {url}")
+ print(f" Output: {output_path}")
+
+ # Use requests library if available (better redirect/cookie handling)
+ if HAS_REQUESTS:
+ return self._download_with_requests(url, output_path, chunk_size)
+ else:
+ return self._download_with_urllib(url, output_path, chunk_size)
+
+ def _download_with_requests(self, url: str, output_path: Path, chunk_size: int) -> Path:
+ """Download using requests library (handles redirects better)."""
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'Accept-Language': 'en-GB,en;q=0.9',
+ }
+
+ session = requests.Session()
+ session.headers.update(headers)
+
+ response = session.get(url, stream=True, allow_redirects=True, timeout=30)
+ response.raise_for_status()
+
+ total_size = int(response.headers.get('content-length', 0))
+ downloaded = 0
+
+ with open(output_path, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=chunk_size):
+ if chunk:
+ f.write(chunk)
+ downloaded += len(chunk)
+
+ if total_size > 0:
+ progress = (downloaded / total_size) * 100
+ mb_downloaded = downloaded / (1024 * 1024)
+ mb_total = total_size / (1024 * 1024)
+ print(f"\r Progress: {progress:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end="", flush=True)
+
+ print() # New line after progress
+        print(f" ✓ Downloaded {downloaded:,} bytes")
+ return output_path
+
+ def _download_with_urllib(self, url: str, output_path: Path, chunk_size: int) -> Path:
+ """Download using urllib (fallback)."""
+
+ # Add comprehensive browser headers to mimic real browser
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+ 'Accept-Language': 'en-GB,en;q=0.9',
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Connection': 'keep-alive',
+ 'Upgrade-Insecure-Requests': '1',
+ 'Sec-Fetch-Dest': 'document',
+ 'Sec-Fetch-Mode': 'navigate',
+ 'Sec-Fetch-Site': 'none',
+ }
+
+ req = urllib.request.Request(url, headers=headers)
+
+ with urllib.request.urlopen(req) as response:
+ total_size = int(response.headers.get("content-length", 0))
+ downloaded = 0
+
+ with open(output_path, "wb") as f:
+ while True:
+ chunk = response.read(chunk_size)
+ if not chunk:
+ break
+ f.write(chunk)
+ downloaded += len(chunk)
+
+ if total_size > 0:
+ progress = (downloaded / total_size) * 100
+ print(
+ f"\r Progress: {progress:.1f}% ({downloaded:,}/{total_size:,} bytes)",
+ end="",
+ )
+
+ print() # New line after progress
+ size_mb = output_path.stat().st_size / (1024 * 1024)
+ print(f" Downloaded: {size_mb:.1f} MB")
+
+ return output_path
From 0b2c2dcb013f03300c1cf98b580ae40ebd5713a8 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:12:03 +0000
Subject: [PATCH 04/15] =?UTF-8?q?feat:=20add=20GML=20converter=20with=20mu?=
=?UTF-8?q?ltiple=20output=20formats=20including=20CSV=20and=20Parquet=20R?=
=?UTF-8?q?apid=20local=20performance=20test=20environment=20supporting=20?=
=?UTF-8?q?the=20Polars=E2=80=91based=20transformation=20rewrite=20in=20di?=
=?UTF-8?q?gital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/gml_converter.py | 458 +++++++++++++++++++++++++++++++++
1 file changed, 458 insertions(+)
create mode 100644 local_testing/gml_converter.py
diff --git a/local_testing/gml_converter.py b/local_testing/gml_converter.py
new file mode 100644
index 00000000..4a3b4036
--- /dev/null
+++ b/local_testing/gml_converter.py
@@ -0,0 +1,458 @@
+"""
+GML converter for title-boundary datasets.
+
+Provides multiple conversion strategies:
+- Regex-based CSV conversion
+- Polars-based Parquet conversion
+- DuckDB-based conversion (fastest, with spatial transforms)
+"""
+
+import csv
+import re
+from pathlib import Path
+from typing import Optional
+
+
+class GMLConverter:
+ """Converts GML files to CSV/Parquet with multiple strategies."""
+
+ @staticmethod
+ def extract_polygon_wkt(geometry_text: str) -> str:
+ """
+ Extract polygon coordinates and convert to WKT format.
+
+ Handles both exterior rings and interior rings (holes).
+
+ Args:
+ geometry_text: GML geometry element text
+
+ Returns:
+ WKT polygon string, or empty string if no valid geometry
+ """
+ exterior_match = re.search(
+            r'<gml:exterior>.*?<gml:posList>([^<]+)</gml:posList>.*?</gml:exterior>',
+ geometry_text, re.DOTALL
+ )
+
+ if not exterior_match:
+ return ""
+
+ exterior_coords_raw = exterior_match.group(1).strip().split()
+ exterior_coords = []
+ for i in range(0, len(exterior_coords_raw), 2):
+ if i + 1 < len(exterior_coords_raw):
+ exterior_coords.append(f"{exterior_coords_raw[i]} {exterior_coords_raw[i+1]}")
+
+ if not exterior_coords:
+ return ""
+
+ # Extract interior rings (holes)
+ interior_rings = []
+ interior_matches = re.findall(
+            r'<gml:interior>.*?<gml:posList>([^<]+)</gml:posList>.*?</gml:interior>',
+ geometry_text, re.DOTALL
+ )
+
+ for interior_coords_raw in interior_matches:
+ coords = interior_coords_raw.strip().split()
+ ring_coords = []
+ for i in range(0, len(coords), 2):
+ if i + 1 < len(coords):
+ ring_coords.append(f"{coords[i]} {coords[i+1]}")
+ if ring_coords:
+ interior_rings.append(ring_coords)
+
+ exterior_wkt = f"({', '.join(exterior_coords)})"
+ if interior_rings:
+ interior_wkts = [f"({', '.join(ring)})" for ring in interior_rings]
+ return f"POLYGON({exterior_wkt}, {', '.join(interior_wkts)})"
+ return f"POLYGON({exterior_wkt})"
+
+ @staticmethod
+ def extract_field(text: str, field_name: str) -> str:
+ """
+ Extract a field value from GML text.
+
+ Args:
+ text: GML text to search
+ field_name: Field name to extract
+
+ Returns:
+ Field value, or empty string if not found
+ """
+        pattern = f'<{field_name}>([^<]+)</{field_name}>'
+ match = re.search(pattern, text)
+ return match.group(1) if match else ""
+
+ def convert_to_csv(self, gml_path: Path, csv_path: Path, limit: Optional[int] = None) -> int:
+ """
+ Convert GML file to CSV format using regex parsing.
+
+ This is the baseline method - slower but doesn't require external dependencies.
+
+ Args:
+ gml_path: Path to input GML file
+ csv_path: Path to output CSV file
+ limit: Optional limit on number of records to convert
+
+ Returns:
+ Number of records converted
+ """
+ print(f" Converting GML to CSV...")
+ print(f" Input: {gml_path}")
+ print(f" Output: {csv_path}")
+
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" GML size: {size_mb:.1f} MB")
+
+ with open(gml_path, 'r', encoding='utf-8') as f:
+ content = f.read()
+
+ # Find all cadastral parcel elements
+        pattern = r'<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>'
+ matches = re.findall(pattern, content, re.DOTALL)
+ total_features = len(matches)
+ print(f" Found {total_features} cadastral parcels")
+
+ if limit:
+ print(f" Limiting to {limit} records")
+
+ fieldnames = [
+ 'reference', 'name', 'national-cadastral-reference', 'geometry',
+ 'start-date', 'entry-date', 'end-date', 'prefix', 'organisation', 'notes'
+ ]
+
+ csv_path.parent.mkdir(parents=True, exist_ok=True)
+ count = 0
+
+ with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
+ writer.writeheader()
+
+ for match in matches:
+ feature = {}
+
+ inspire_id = self.extract_field(match, 'INSPIREID')
+ if inspire_id:
+ feature['reference'] = inspire_id
+ feature['name'] = inspire_id
+
+ ncr = self.extract_field(match, 'NATIONALCADASTRALREFERENCE')
+ if ncr:
+ feature['national-cadastral-reference'] = ncr
+
+ valid_from = self.extract_field(match, 'VALIDFROM')
+ if valid_from:
+ feature['start-date'] = valid_from.split('T')[0] if 'T' in valid_from else valid_from
+
+ begin_lifespan = self.extract_field(match, 'BEGINLIFESPANVERSION')
+ if begin_lifespan:
+ feature['entry-date'] = begin_lifespan.split('T')[0] if 'T' in begin_lifespan else begin_lifespan
+
+                geometry_match = re.search(r'<cp:geometry>(.*?)</cp:geometry>', match, re.DOTALL)
+ if geometry_match:
+ wkt = self.extract_polygon_wkt(geometry_match.group(1))
+ if wkt:
+ feature['geometry'] = wkt
+
+ if 'reference' in feature:
+ feature['prefix'] = 'title-boundary'
+ feature['organisation'] = 'government-organisation:D2'
+ writer.writerow(feature)
+ count += 1
+
+ if count % 5000 == 0:
+ print(f" Converted {count}/{total_features} features...")
+
+ if limit and count >= limit:
+ break
+
+ print(f" Converted {count} records to CSV")
+ return count
+
+ def convert_to_parquet(self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None) -> int:
+ """
+ Convert GML file to Parquet format using regex parsing + Polars.
+
+ Parquet is faster to read than CSV and preserves data types.
+ Falls back to CSV if Polars is not installed.
+
+ Args:
+ gml_path: Path to input GML file
+ parquet_path: Path to output Parquet file
+ limit: Optional limit on number of records to convert
+
+ Returns:
+ Number of records converted
+ """
+ try:
+ import polars as pl
+ except ImportError:
+ print(" Polars not installed. Install with: pip install polars")
+ print(" Falling back to CSV...")
+ csv_path = parquet_path.with_suffix('.csv')
+ return self.convert_to_csv(gml_path, csv_path, limit)
+
+ print(f" Converting GML to Parquet...")
+ print(f" Input: {gml_path}")
+ print(f" Output: {parquet_path}")
+
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" GML size: {size_mb:.1f} MB")
+
+ with open(gml_path, 'r', encoding='utf-8') as f:
+ content = f.read()
+
+ # Find all cadastral parcel elements
+        pattern = r'<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>'
+ matches = re.findall(pattern, content, re.DOTALL)
+ total_features = len(matches)
+ print(f" Found {total_features} cadastral parcels")
+
+ if limit:
+ print(f" Limiting to {limit} records")
+ matches = matches[:limit]
+
+ # Build list of records
+ records = []
+ for match in matches:
+ feature = {}
+
+ inspire_id = self.extract_field(match, 'INSPIREID')
+ if inspire_id:
+ feature['reference'] = inspire_id
+ feature['name'] = inspire_id
+
+ ncr = self.extract_field(match, 'NATIONALCADASTRALREFERENCE')
+ if ncr:
+ feature['national-cadastral-reference'] = ncr
+
+ valid_from = self.extract_field(match, 'VALIDFROM')
+ if valid_from:
+ feature['start-date'] = valid_from.split('T')[0] if 'T' in valid_from else valid_from
+
+ begin_lifespan = self.extract_field(match, 'BEGINLIFESPANVERSION')
+ if begin_lifespan:
+ feature['entry-date'] = begin_lifespan.split('T')[0] if 'T' in begin_lifespan else begin_lifespan
+
+            geometry_match = re.search(r'<cp:geometry>(.*?)</cp:geometry>', match, re.DOTALL)
+ if geometry_match:
+ wkt = self.extract_polygon_wkt(geometry_match.group(1))
+ if wkt:
+ feature['geometry'] = wkt
+
+ if 'reference' in feature:
+ feature['prefix'] = 'title-boundary'
+ feature['organisation'] = 'government-organisation:D2'
+ feature['end-date'] = None
+ feature['notes'] = None
+ records.append(feature)
+
+ # Create DataFrame and write to Parquet
+ parquet_path.parent.mkdir(parents=True, exist_ok=True)
+
+ df = pl.DataFrame(records)
+ df.write_parquet(parquet_path, compression='snappy')
+
+ count = len(records)
+ print(f" Converted {count} records to Parquet")
+ return count
+
+ def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None) -> int:
+ """
+ Convert GML file to Parquet format using DuckDB with spatial extension.
+
+ This is the fastest method - DuckDB reads GML directly and writes Parquet.
+ Falls back to Polars-based converter if DuckDB is not available.
+
+ Args:
+ gml_path: Path to input GML file
+ parquet_path: Path to output Parquet file
+ limit: Optional limit on number of records to convert
+
+ Returns:
+ Number of records converted
+ """
+ try:
+ import duckdb
+ except ImportError:
+ print(" DuckDB not installed. Install with: pip install duckdb")
+ print(" Falling back to Polars-based converter...")
+ return self.convert_to_parquet(gml_path, parquet_path, limit)
+
+ print(f" Converting GML to Parquet using DuckDB...")
+ print(f" Input: {gml_path}")
+ print(f" Output: {parquet_path}")
+
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" GML size: {size_mb:.1f} MB")
+
+ parquet_path.parent.mkdir(parents=True, exist_ok=True)
+
+ try:
+ con = duckdb.connect()
+ try:
+ con.execute("INSTALL spatial; LOAD spatial;")
+ print(" Loaded DuckDB spatial extension")
+ except Exception as ext_err:
+ print(f" Failed to load spatial extension: {ext_err}")
+ print(" Falling back to Polars-based converter...")
+ con.close()
+ return self.convert_to_parquet(gml_path, parquet_path, limit)
+
+ print(" Reading GML file...")
+ limit_clause = f"LIMIT {limit}" if limit else ""
+
+ query = f"""
+ SELECT
+ INSPIREID as reference,
+ INSPIREID as name,
+ NATIONALCADASTRALREFERENCE as "national-cadastral-reference",
+ ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry,
+ CASE
+ WHEN VALIDFROM IS NOT NULL
+ THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d')
+ ELSE NULL
+ END as "start-date",
+ CASE
+ WHEN BEGINLIFESPANVERSION IS NOT NULL
+ THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d')
+ ELSE NULL
+ END as "entry-date",
+ NULL as "end-date",
+ 'title-boundary' as prefix,
+ 'government-organisation:D2' as organisation,
+ NULL as notes
+ FROM ST_Read('{gml_path}')
+ WHERE INSPIREID IS NOT NULL
+ {limit_clause}
+ """
+
+ count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')"
+ total_count = con.execute(count_query).fetchone()[0]
+ print(f" Found {total_count:,} cadastral parcels")
+
+ if limit:
+ print(f" Limiting to {limit} records")
+
+ # Export directly to Parquet (much faster than CSV)
+ print(" Transforming and writing to Parquet...")
+ con.execute(f"COPY ({query}) TO '{parquet_path}' (FORMAT PARQUET, COMPRESSION 'snappy')")
+
+ # Count output rows
+ result_count = con.execute(f"SELECT COUNT(*) FROM read_parquet('{parquet_path}')").fetchone()[0]
+
+ con.close()
+
+ print(f" Converted {result_count:,} records to Parquet")
+ return result_count
+
+ except Exception as e:
+ print(f" DuckDB conversion failed: {e}")
+ print(" Falling back to Polars-based converter...")
+ return self.convert_to_parquet(gml_path, parquet_path, limit)
+
+ def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[int] = None) -> int:
+ """
+ Convert GML file to CSV format using DuckDB with spatial extension.
+
+ This is significantly faster than regex parsing and properly handles:
+ - Coordinate transformations (OSGB EPSG:27700 to WGS84 EPSG:4326)
+ - Complex geometries (multi-polygons, holes)
+ - Large files with streaming
+
+ Note: For even better performance, use convert_to_parquet_duckdb() instead.
+ Falls back to regex-based converter if DuckDB is not available.
+
+ Args:
+ gml_path: Path to input GML file
+ csv_path: Path to output CSV file
+ limit: Optional limit on number of records to convert
+
+ Returns:
+ Number of records converted
+ """
+ try:
+ import duckdb
+ except ImportError:
+ print(" DuckDB not installed. Install with: pip install duckdb")
+ print(" Falling back to regex-based converter...")
+ return self.convert_to_csv(gml_path, csv_path, limit)
+
+ print(f" Converting GML to CSV using DuckDB...")
+ print(f" Input: {gml_path}")
+ print(f" Output: {csv_path}")
+
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" GML size: {size_mb:.1f} MB")
+
+ csv_path.parent.mkdir(parents=True, exist_ok=True)
+
+ try:
+ # Create DuckDB connection and load spatial extension
+ con = duckdb.connect()
+ try:
+ con.execute("INSTALL spatial; LOAD spatial;")
+ print(" Loaded DuckDB spatial extension")
+ except Exception as ext_err:
+ print(f" Failed to load spatial extension: {ext_err}")
+ print(" This may be a network issue. Try running:")
+ print(" python -c \"import duckdb; duckdb.connect().execute('INSTALL spatial')\"")
+ print(" Falling back to regex-based converter...")
+ con.close()
+ return self.convert_to_csv(gml_path, csv_path, limit)
+
+ # Read GML file using ST_Read (GDAL-based)
+ print(" Reading GML file...")
+
+ limit_clause = f"LIMIT {limit}" if limit else ""
+
+ query = f"""
+ SELECT
+ INSPIREID as reference,
+ INSPIREID as name,
+ NATIONALCADASTRALREFERENCE as "national-cadastral-reference",
+ ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry,
+ CASE
+ WHEN VALIDFROM IS NOT NULL
+ THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d')
+ ELSE NULL
+ END as "start-date",
+ CASE
+ WHEN BEGINLIFESPANVERSION IS NOT NULL
+ THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d')
+ ELSE NULL
+ END as "entry-date",
+ NULL as "end-date",
+ 'title-boundary' as prefix,
+ 'government-organisation:D2' as organisation,
+ NULL as notes
+ FROM ST_Read('{gml_path}')
+ WHERE INSPIREID IS NOT NULL
+ {limit_clause}
+ """
+
+ # Execute and get count first
+ count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')"
+ total_count = con.execute(count_query).fetchone()[0]
+ print(f" Found {total_count:,} cadastral parcels")
+
+ if limit:
+ print(f" Limiting to {limit} records")
+
+ # Export directly to CSV
+ print(" Transforming and writing to CSV...")
+ con.execute(f"COPY ({query}) TO '{csv_path}' (HEADER, DELIMITER ',')")
+
+ # Count output rows
+ result_count = con.execute(f"SELECT COUNT(*) FROM read_csv('{csv_path}')").fetchone()[0]
+
+ con.close()
+
+ print(f" Converted {result_count:,} records to CSV")
+ return result_count
+
+ except Exception as e:
+ print(f" DuckDB conversion failed: {e}")
+ print(" Falling back to regex-based converter...")
+ return self.convert_to_csv(gml_path, csv_path, limit)
From ec0a0de79dc2813229e7f44fb055de5becef9ba0 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:12:18 +0000
Subject: [PATCH 05/15] =?UTF-8?q?feat:=20add=20GML=20extractor=20for=20tit?=
=?UTF-8?q?le-boundary=20datasets=20with=20ZIP=20archive=20support=20Rapid?=
=?UTF-8?q?=20local=20performance=20test=20environment=20supporting=20the?=
=?UTF-8?q?=20Polars=E2=80=91based=20transformation=20rewrite=20in=20digit?=
=?UTF-8?q?al-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/gml_extractor.py | 50 ++++++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)
create mode 100644 local_testing/gml_extractor.py
diff --git a/local_testing/gml_extractor.py b/local_testing/gml_extractor.py
new file mode 100644
index 00000000..fb004301
--- /dev/null
+++ b/local_testing/gml_extractor.py
@@ -0,0 +1,50 @@
+"""
+GML extractor for title-boundary datasets.
+
+Handles extraction of GML files from ZIP archives.
+"""
+
+import zipfile
+from pathlib import Path
+
+
+class GMLExtractor:
+ """Extracts GML files from ZIP archives."""
+
+ @staticmethod
+ def extract_gml_from_zip(zip_path: Path, output_dir: Path) -> Path:
+ """
+ Extract GML file from ZIP archive.
+
+ Args:
+ zip_path: Path to ZIP file
+ output_dir: Directory to extract GML file to
+
+ Returns:
+ Path to extracted GML file
+
+ Raises:
+ ValueError: If no GML file found in archive
+ """
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ print(f" Extracting GML from {zip_path}")
+
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+ # Find GML file in archive
+ gml_files = [f for f in zip_ref.namelist() if f.lower().endswith('.gml')]
+
+ if not gml_files:
+ raise ValueError(f"No GML file found in {zip_path}")
+
+ gml_filename = gml_files[0]
+ print(f" Found: {gml_filename}")
+
+ # Extract to output directory
+ zip_ref.extract(gml_filename, output_dir)
+
+ gml_path = output_dir / gml_filename
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" Extracted: {gml_path} ({size_mb:.1f} MB)")
+
+ return gml_path
From 92558deba0115c81501883aa9df8a439c45f85ba Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:12:32 +0000
Subject: [PATCH 06/15] =?UTF-8?q?feat:=20add=20Makefile=20for=20title=20bo?=
=?UTF-8?q?undary=20pipeline=20setup=20and=20management=20Rapid=20local=20?=
=?UTF-8?q?performance=20test=20environment=20supporting=20the=20Polars?=
=?UTF-8?q?=E2=80=91based=20transformation=20rewrite=20in=20digital-land-p?=
=?UTF-8?q?ython=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/Makefile | 96 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 96 insertions(+)
create mode 100644 local_testing/Makefile
diff --git a/local_testing/Makefile b/local_testing/Makefile
new file mode 100644
index 00000000..7b564da0
--- /dev/null
+++ b/local_testing/Makefile
@@ -0,0 +1,96 @@
+.PHONY: help init setup-dirs setup-spec check-spec list run fast compare test clean clean-all
+
+PYTHON := venv/bin/python3
+PIP := venv/bin/pip
+SPEC_DIR := ../specification
+
+help:
+	@echo "Title Boundary Pipeline"
+	@echo ""
+	@echo "  make init             First time setup (dirs + venv + spec check)"
+	@echo "  make setup-dirs       Create all required directories"
+	@echo "  make setup-spec       Clone specification files from GitHub"
+	@echo "  make check-spec       Check if specification files exist"
+	@echo "  make list             List available Local Authorities"
+	@echo "  make run LA=Name      Process a Local Authority"
+	@echo "  make fast LA=Name     Process with DuckDB + Parquet"
+	@echo "  make compare LA=Name  Run original + Polars for comparison"
+	@echo "  make test             Test all module imports"
+	@echo "  make clean            Remove generated data"
+	@echo "  make clean-all        Remove data + venv"
+	@echo ""
+	@echo "Examples:"
+	@echo "  make init             # Complete setup"
+	@echo "  make setup-spec       # Clone specification if missing"
+	@echo "  make run LA=Buckinghamshire LIMIT=100"
+	@echo "  make fast LA=\"East Sussex\""
+	@echo "  make compare LA=Buckinghamshire LIMIT=1000"
+
+setup-dirs:
+	@mkdir -p raw extracted converted output reports cache pipeline
+	@echo "✅ Created directories: raw/ extracted/ converted/ output/ reports/ cache/ pipeline/"
+
+setup-spec:
+	@if [ -d "$(SPEC_DIR)" ]; then \
+		echo "✅ Specification already exists at $(SPEC_DIR)"; \
+	else \
+		echo "📥 Cloning specification from GitHub..."; \
+		cd .. && git clone https://github.com/digital-land/specification.git; \
+		if [ -d "$(SPEC_DIR)" ]; then \
+			echo "✅ Specification cloned successfully"; \
+			echo "   Files: $$(ls -1 $(SPEC_DIR)/*.csv 2>/dev/null | wc -l | tr -d ' ') CSV files"; \
+		else \
+			echo "❌ Failed to clone specification"; \
+			exit 1; \
+		fi \
+	fi
+
+check-spec:
+	@if [ -d "$(SPEC_DIR)" ]; then \
+		echo "✅ Specification found at $(SPEC_DIR)"; \
+		echo "   Files: $$(ls -1 $(SPEC_DIR)/*.csv 2>/dev/null | wc -l | tr -d ' ') CSV files"; \
+	else \
+		echo "❌ Specification not found at $(SPEC_DIR)"; \
+		echo ""; \
+		echo "Run 'make setup-spec' to clone automatically, or:"; \
+		echo "  cd ../"; \
+		echo "  git clone https://github.com/digital-land/specification.git"; \
+		echo ""; \
+		exit 1; \
+	fi
+
+init: setup-dirs venv setup-spec
+	@echo "✅ Setup complete - ready to run pipeline"
+
+venv:
+	@python3 -m venv venv
+	@$(PIP) install -q --upgrade pip
+	@$(PIP) install -q polars duckdb requests
+	@$(PIP) install -q -e ..
+	@echo "  ✓ Installed digital-land-python in editable mode"
+
+list: venv
+	@$(PYTHON) main.py --list
+
+run: venv
+	@test -n "$(LA)" || (echo "Error: make run LA=Name"; exit 1)
+	@$(PYTHON) main.py --la "$(LA)" $(if $(LIMIT),--limit $(LIMIT))
+
+fast: venv
+	@test -n "$(LA)" || (echo "Error: make fast LA=Name"; exit 1)
+	@$(PYTHON) main.py --la "$(LA)" --use-duckdb --use-parquet $(if $(LIMIT),--limit $(LIMIT))
+
+compare: venv
+	@test -n "$(LA)" || (echo "Error: make compare LA=Name"; exit 1)
+	@$(PYTHON) main.py --la "$(LA)" --compare $(if $(LIMIT),--limit $(LIMIT))
+
+test: venv
+	@$(PYTHON) -c "from cli import CLI; from file_downloader import FileDownloader; from gml_extractor import GMLExtractor; from gml_converter import GMLConverter; from pipeline_config import PipelineConfig; from pipeline_runner import PipelineRunner; from pipeline_report import PipelineReport; print('✅ All modules OK')"
+
+clean:
+	@rm -rf raw/* extracted/* converted/* output/* reports/*
+	@echo "✅ Data cleaned"
+
+clean-all: clean
+	@rm -rf venv/ cache/*
+	@echo "✅ All cleaned"
From 52be8abd7f8e838d7ab44b8f2634091fa7d612a6 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:13:12 +0000
Subject: [PATCH 07/15] =?UTF-8?q?feat:=20add=20pipeline=20configuration=20?=
=?UTF-8?q?management=20for=20title-boundary=20dataset=20Rapid=20local=20p?=
=?UTF-8?q?erformance=20test=20environment=20supporting=20the=20Polars?=
=?UTF-8?q?=E2=80=91based=20transformation=20rewrite=20in=20digital-land-p?=
=?UTF-8?q?ython=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/pipeline_config.py | 93 ++++++++++++++++++++++++++++++++
1 file changed, 93 insertions(+)
create mode 100644 local_testing/pipeline_config.py
diff --git a/local_testing/pipeline_config.py b/local_testing/pipeline_config.py
new file mode 100644
index 00000000..35aaf338
--- /dev/null
+++ b/local_testing/pipeline_config.py
@@ -0,0 +1,93 @@
+"""
+Pipeline configuration management for title-boundary dataset.
+
+Handles creation and management of pipeline configuration CSV files
+and downloading of required resources like organisation.csv.
+"""
+
+import urllib.request
+from pathlib import Path
+
+
+class PipelineConfig:
+    """Manages pipeline configuration files and resources."""
+
+    @staticmethod
+    def ensure_pipeline_config(pipeline_dir: Path):
+        """
+        Ensure all required pipeline configuration CSV files exist.
+
+        Creates default configuration files for:
+        - column mapping
+        - default values
+        - patches, concatenations, combinations
+        - filters, lookups, migrations, redirects
+
+        Args:
+            pipeline_dir: Directory where pipeline config files should be created
+        """
+        pipeline_dir.mkdir(parents=True, exist_ok=True)  # idempotent: safe to call repeatedly
+
+        configs = {  # filename -> default content; most files are a bare header row
+            "column.csv": """dataset,resource,column,field
+title-boundary,,reference,reference
+title-boundary,,name,name
+title-boundary,,geometry,geometry
+title-boundary,,start-date,start-date
+title-boundary,,entry-date,entry-date
+title-boundary,,end-date,end-date
+title-boundary,,prefix,prefix
+title-boundary,,organisation,organisation
+title-boundary,,notes,notes
+title-boundary,,national-cadastral-reference,notes
+""",
+            "default.csv": "dataset,resource,field,default-field,entry-date\n",
+            "patch.csv": "dataset,resource,field,pattern,value\n",
+            "concat.csv": "dataset,resource,field,fields,separator\n",
+            "combine.csv": "dataset,resource,field,fields,separator\n",
+            "convert.csv": "dataset,resource,field,value,replacement\n",
+            "filter.csv": "dataset,resource,field,pattern\n",
+            "skip.csv": "dataset,resource,pattern\n",
+            "lookup.csv": "prefix,resource,organisation,reference,entity\n",
+            "migrate.csv": "dataset,old-field,new-field\n",
+            "redirect.csv": "entity,status,redirect-entity\n",
+        }
+
+        for filename, content in configs.items():
+            filepath = pipeline_dir / filename
+            if not filepath.exists():  # never overwrite a user-edited config file
+                filepath.write_text(content)
+
+    @staticmethod
+    def download_organisation_csv(cache_dir: Path) -> Path:
+        """
+        Download organisation.csv from digital-land repository if not present.
+
+        Falls back to creating a minimal organisation.csv with Land Registry
+        data if download fails.
+
+        Args:
+            cache_dir: Directory where organisation.csv should be cached
+
+        Returns:
+            Path to organisation.csv file
+        """
+        org_csv = cache_dir / "organisation.csv"
+
+        if not org_csv.exists():  # cached copy wins; delete the file to force a re-download
+            print("  Downloading organisation.csv...")
+            url = "https://raw.githubusercontent.com/digital-land/organisation-dataset/main/collection/organisation.csv"
+
+            try:
+                cache_dir.mkdir(parents=True, exist_ok=True)
+                urllib.request.urlretrieve(url, org_csv)
+                print(f"  Downloaded organisation.csv ({org_csv.stat().st_size} bytes)")
+            except Exception as e:  # best-effort: any failure (network, DNS, 404) falls back to the stub below
+                print(f"  Warning: Could not download ({e}), creating minimal file")
+                org_csv.write_text(
+                    "organisation,name,statistical-geography,opendatacommunities-uri\n"
+                    "government-organisation:D2,Land Registry,E92000001,"
+                    "http://opendatacommunities.org/id/government-organisation/land-registry\n"
+                )
+
+        return org_csv
From f391856ca54ff44dd33c66dbbee67a29de81deaa Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:19:56 +0000
Subject: [PATCH 08/15] =?UTF-8?q?feat:=20update=20.gitignore=20to=20includ?=
=?UTF-8?q?e=20local=20testing=20scripts=20and=20README=20Rapid=20local=20?=
=?UTF-8?q?performance=20test=20environment=20supporting=20the=20Polars?=
=?UTF-8?q?=E2=80=91based=20transformation=20rewrite=20in=20digital-land-p?=
=?UTF-8?q?ython=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/.gitignore b/.gitignore
index 0d82846f..ea7b0f32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,3 +49,8 @@ notebooks/.ipynb_checkpoints
/local_testing/reports/
/local_testing/specification/
/local_testing/venv/
+
+/local_testing/main.py
+/local_testing/pipeline_report.py
+/local_testing/pipeline_runner.py
+/local_testing/README.md
From 82857ad5f44e2c80915a3f1ef72bc738dbf6a7fc Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Fri, 6 Feb 2026 11:38:03 +0000
Subject: [PATCH 09/15] =?UTF-8?q?feat:=20enhance=20Makefile=20and=20CLI=20?=
=?UTF-8?q?for=20improved=20pipeline=20comparison=20and=20add=20run=5Fall?=
=?UTF-8?q?=20script=20for=20batch=20processing=20Rapid=20local=20performa?=
=?UTF-8?q?nce=20test=20environment=20supporting=20the=20Polars=E2=80=91ba?=
=?UTF-8?q?sed=20transformation=20rewrite=20in=20digital-land-python=20Fix?=
=?UTF-8?q?es=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/Makefile | 24 +++++----
local_testing/cli.py | 5 ++
local_testing/run_all.py | 113 +++++++++++++++++++++++++++++++++++++++
3 files changed, 133 insertions(+), 9 deletions(-)
create mode 100755 local_testing/run_all.py
diff --git a/local_testing/Makefile b/local_testing/Makefile
index 7b564da0..53835a7a 100644
--- a/local_testing/Makefile
+++ b/local_testing/Makefile
@@ -1,4 +1,4 @@
-.PHONY: help init setup-dirs setup-spec check-spec list run fast compare test clean clean-all
+.PHONY: help init setup-dirs setup-spec check-spec list run run-all fast compare test clean clean-all
PYTHON := venv/bin/python3
PIP := venv/bin/pip
@@ -12,19 +12,22 @@ help:
@echo " make setup-spec Clone specification files from GitHub"
@echo " make check-spec Check if specification files exist"
@echo " make list List available Local Authorities"
- @echo " make run LA=Name Process a Local Authority"
- @echo " make fast LA=Name Process with DuckDB + Parquet"
- @echo " make compare LA=Name Run original + Polars for comparison"
+ @echo " make run LA=Name Process with comparison (Original + Polars)"
+ @echo " make run-all Process ALL LAs with comparison"
+ @echo " make fast LA=Name DuckDB+Parquet with comparison"
@echo " make test Test all module imports"
@echo " make clean Remove generated data"
@echo " make clean-all Remove data + venv"
@echo ""
+ @echo "Note: All run commands automatically include Polars comparison"
+ @echo ""
@echo "Examples:"
@echo " make init # Complete setup"
@echo " make setup-spec # Clone specification if missing"
- @echo " make run LA=Buckinghamshire LIMIT=100"
- @echo " make fast LA=\"East Sussex\""
- @echo " make compare LA=Buckinghamshire LIMIT=1000"
+ @echo " make run LA=Buckinghamshire LIMIT=100 # Compare both pipelines"
+ @echo " make run LA=Buckinghamshire PHASES=1,2,9 # Run specific phases"
+ @echo " make run-all LIMIT=100 # Process all LAs with comparison"
+ @echo " make fast LA=\"East Sussex\" # Fast mode with comparison"
setup-dirs:
@mkdir -p raw extracted converted output reports cache pipeline
@@ -74,11 +77,14 @@ list: venv
run: venv
@test -n "$(LA)" || (echo "Error: make run LA=Name"; exit 1)
- @$(PYTHON) main.py --la "$(LA)" $(if $(LIMIT),--limit $(LIMIT))
+ @$(PYTHON) main.py --la "$(LA)" --compare $(if $(LIMIT),--limit $(LIMIT)) $(if $(PHASES),--phases $(PHASES))
+
+run-all: venv
+ @$(PYTHON) run_all.py $(LIMIT)
fast: venv
@test -n "$(LA)" || (echo "Error: make fast LA=Name"; exit 1)
- @$(PYTHON) main.py --la "$(LA)" --use-duckdb --use-parquet $(if $(LIMIT),--limit $(LIMIT))
+ @$(PYTHON) main.py --la "$(LA)" --use-duckdb --use-parquet --compare $(if $(LIMIT),--limit $(LIMIT))
compare: venv
@test -n "$(LA)" || (echo "Error: make compare LA=Name"; exit 1)
diff --git a/local_testing/cli.py b/local_testing/cli.py
index 4ab74687..2eb0f774 100644
--- a/local_testing/cli.py
+++ b/local_testing/cli.py
@@ -62,6 +62,11 @@ def create_parser() -> argparse.ArgumentParser:
action="store_true",
help="Run both original and Polars pipelines for performance comparison",
)
+ parser.add_argument(
+ "--phases",
+ type=str,
+ help="Comma-separated phase numbers to run (e.g. '1,2,9' or '1-5,9')",
+ )
return parser
diff --git a/local_testing/run_all.py b/local_testing/run_all.py
new file mode 100755
index 00000000..19856b00
--- /dev/null
+++ b/local_testing/run_all.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""
+Script to run the pipeline for all Local Authorities.
+"""
+
+import sys
+import subprocess
+import time
+import json
+from pathlib import Path
+from datetime import datetime
+from cli import CLI
+
+
+def main():
+ """Run pipeline for all Local Authorities."""
+ # Get limit from command line if provided
+ limit = None
+ if len(sys.argv) > 1:
+ limit = sys.argv[1]
+
+ # Fetch all endpoints
+ print("Fetching endpoint list...")
+ endpoints = CLI.fetch_endpoint_list()
+ print(f"Found {len(endpoints)} Local Authorities")
+ print(f"Running with Polars comparison enabled\n")
+
+ success_count = 0
+ error_count = 0
+ errors = []
+ la_times = []
+ batch_start = time.time()
+
+ for i, ep in enumerate(endpoints, 1):
+ la = ep.get("local_authority", "Unknown")
+ print(f"\n{'='*60}")
+ print(f"[{i}/{len(endpoints)}] Processing: {la}")
+ print(f"{'='*60}")
+
+ # Build command with --compare flag for Polars
+ cmd = [sys.executable, "main.py", "--la", la, "--compare"]
+ if limit:
+ cmd.extend(["--limit", limit])
+
+ # Time this LA
+ la_start = time.time()
+ result = subprocess.run(cmd)
+ la_duration = time.time() - la_start
+
+ if result.returncode != 0:
+ print(f" ā ļø Error processing {la}")
+ error_count += 1
+ errors.append(la)
+ la_times.append({"la": la, "duration": la_duration, "status": "error"})
+ else:
+ print(f" ā
Completed {la} ({la_duration:.1f}s)")
+ success_count += 1
+ la_times.append({"la": la, "duration": la_duration, "status": "success"})
+
+ # Calculate batch metrics
+ batch_duration = time.time() - batch_start
+ avg_duration = sum(t["duration"] for t in la_times) / len(la_times) if la_times else 0
+ successful_times = [t["duration"] for t in la_times if t["status"] == "success"]
+
+ # Summary
+ print(f"\n{'='*60}")
+ print("BATCH PROCESSING COMPLETE (with Polars Comparison)")
+ print(f"{'='*60}")
+ print(f" Total LAs: {len(endpoints)}")
+ print(f" Success: {success_count}")
+ print(f" Errors: {error_count}")
+ print(f" Total Time: {batch_duration:.1f}s ({batch_duration/60:.1f}m)")
+ print(f" Avg Time/LA: {avg_duration:.1f}s")
+ if successful_times:
+ print(f" Min Time: {min(successful_times):.1f}s")
+ print(f" Max Time: {max(successful_times):.1f}s")
+ print(f"\n Note: All LAs processed with both Original + Polars pipelines")
+
+ if errors:
+ print(f"\nFailed Local Authorities:")
+ for la in errors:
+ print(f" - {la}")
+
+ # Save batch report
+ reports_dir = Path(__file__).parent / "reports"
+ reports_dir.mkdir(parents=True, exist_ok=True)
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+ batch_report = {
+ "batch_timestamp": timestamp,
+ "total_las": len(endpoints),
+ "success_count": success_count,
+ "error_count": error_count,
+ "batch_duration_seconds": batch_duration,
+ "average_duration_seconds": avg_duration,
+ "polars_comparison_enabled": True,
+ "limit": limit,
+ "la_results": la_times,
+ "errors": errors
+ }
+
+ batch_json = reports_dir / f"batch_{timestamp}_summary.json"
+ with open(batch_json, "w") as f:
+ json.dump(batch_report, f, indent=2)
+
+ print(f"\nBatch report saved: {batch_json}")
+ print(f"{'='*60}\n")
+
+ return 1 if error_count > 0 else 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
From bfeb931e2ba2dc34cfcbe20b0797cfdfa49e4974 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Fri, 6 Feb 2026 11:42:23 +0000
Subject: [PATCH 10/15] =?UTF-8?q?feat:=20update=20pipeline=20configuration?=
=?UTF-8?q?=20files=20and=20add=20README=20for=20local=20testing=20environ?=
=?UTF-8?q?ment=20Rapid=20local=20performance=20test=20environment=20suppo?=
=?UTF-8?q?rting=20the=20Polars=E2=80=91based=20transformation=20rewrite?=
=?UTF-8?q?=20in=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 3 +-
local_testing/README.md | 311 ++++++++++++++++++++++++++++
local_testing/pipeline/column.csv | 11 +
local_testing/pipeline/combine.csv | 1 +
local_testing/pipeline/concat.csv | 1 +
local_testing/pipeline/convert.csv | 1 +
local_testing/pipeline/default.csv | 2 +
local_testing/pipeline/filter.csv | 1 +
local_testing/pipeline/lookup.csv | 11 +
local_testing/pipeline/migrate.csv | 1 +
local_testing/pipeline/patch.csv | 1 +
local_testing/pipeline/redirect.csv | 1 +
local_testing/pipeline/skip.csv | 1 +
13 files changed, 344 insertions(+), 2 deletions(-)
create mode 100644 local_testing/README.md
create mode 100644 local_testing/pipeline/column.csv
create mode 100644 local_testing/pipeline/combine.csv
create mode 100644 local_testing/pipeline/concat.csv
create mode 100644 local_testing/pipeline/convert.csv
create mode 100644 local_testing/pipeline/default.csv
create mode 100644 local_testing/pipeline/filter.csv
create mode 100644 local_testing/pipeline/lookup.csv
create mode 100644 local_testing/pipeline/migrate.csv
create mode 100644 local_testing/pipeline/patch.csv
create mode 100644 local_testing/pipeline/redirect.csv
create mode 100644 local_testing/pipeline/skip.csv
diff --git a/.gitignore b/.gitignore
index ea7b0f32..d65ec89a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,7 +43,6 @@ notebooks/.ipynb_checkpoints
/local_testing/converted/
/local_testing/extracted/
/local_testing/output/
-/local_testing/pipeline/
/local_testing/polars_phases/
/local_testing/raw/
/local_testing/reports/
@@ -53,4 +52,4 @@ notebooks/.ipynb_checkpoints
/local_testing/main.py
/local_testing/pipeline_report.py
/local_testing/pipeline_runner.py
-/local_testing/README.md
+
diff --git a/local_testing/README.md b/local_testing/README.md
new file mode 100644
index 00000000..41de9085
--- /dev/null
+++ b/local_testing/README.md
@@ -0,0 +1,311 @@
+# Digital Land Pipeline - Local Testing
+
+A modular, self-contained environment for testing the digital-land transformation pipeline
+on various datasets (e.g., UK Land Registry title-boundary data).
+
+## Architecture
+
+The pipeline uses a **clean modular architecture** with 8 specialized classes:
+
+1. **CLI** (121 lines) - Command-line interface and argument parsing
+2. **FileDownloader** (95 lines) - Downloads GML files from data sources
+3. **GMLExtractor** (50 lines) - Extracts GML from ZIP archives
+4. **GMLConverter** (458 lines) - Converts GML to CSV/Parquet (4 strategies)
+5. **PipelineConfig** (93 lines) - Manages pipeline configuration files
+6. **PipelineRunner** (254 lines) - Executes 26-phase digital-land transformation
+7. **PipelineReport** (346 lines) - Performance tracking and reporting
+8. **main.py** (265 lines) - Orchestrates the pipeline by calling specialized classes
+
+**Total**: 2,449 lines across 9 focused, testable modules (down from 1,688 monolithic lines)
+
+## Prerequisites
+
+### Specification Files
+
+The pipeline requires specification files from the digital-land specification repository. These files define schemas, fields, datatypes, and pipeline configurations.
+
+**Files Used (11 of 25):**
+- `dataset.csv`, `schema.csv`, `dataset-schema.csv`
+- `datatype.csv`, `field.csv`, `dataset-field.csv`, `schema-field.csv`
+- `typology.csv`, `pipeline.csv`, `licence.csv`, `provision-rule.csv`
+
+The remaining 14 files are not loaded by the pipeline but may be used by other digital-land tools.
+
+## Quick Start
+
+**Using Makefile (Recommended):**
+
+```bash
+# Navigate to directory
+cd digital-land-python/local_testing
+
+# First time setup - automatically creates directories, installs dependencies, and clones specification
+make init
+
+# If specification already exists elsewhere, you can symlink it instead:
+# cd digital-land-python
+# ln -s /path/to/your/specification specification
+
+# Verify setup
+make check-spec
+
+# List available Local Authorities (for title-boundary dataset)
+make list
+
+# Process a specific LA (includes Polars comparison automatically)
+make run LA="Buckinghamshire"
+
+# Process with record limit
+make run LA="Buckinghamshire" LIMIT=100
+
+# Process ALL Local Authorities (batch mode with comparison)
+make run-all
+
+# Process all with record limit (for testing)
+make run-all LIMIT=100
+
+# Run only specific phases (e.g., phases 1,2,9)
+make run LA="Buckinghamshire" PHASES="1,2,9"
+
+# Run range of phases (e.g., phases 1-5 and 9)
+make run LA="Buckinghamshire" PHASES="1-5,9"
+
+# Use best performance (DuckDB + Parquet, includes comparison)
+make fast LA="Buckinghamshire"
+
+# See all available commands
+make help
+
+# Note: All run commands automatically include Polars comparison
+```
+
+**Manual Setup (Alternative):**
+
+```bash
+# Navigate to local testing directory
+cd digital-land-python/local_testing
+
+# Create virtual environment (first time only)
+python3 -m venv venv
+
+# Activate virtual environment
+source venv/bin/activate
+
+# Install dependencies (first time only)
+pip install polars duckdb
+
+# List available items (dataset-specific)
+python main.py --list
+
+# Process a specific item
+python main.py --la "Buckinghamshire"
+
+# Process with record limit (for testing)
+python main.py --la "Buckinghamshire" --limit 100
+
+# Skip download if already have the file
+python main.py --la "Buckinghamshire" --skip-download
+
+# Use DuckDB with Parquet for best performance
+python main.py --la "Buckinghamshire" --use-duckdb --use-parquet
+```
+
+## What It Does
+
+The pipeline performs 5 steps:
+
+1. **Download** (FileDownloader) - Fetches data files from source API
+2. **Extract** (GMLExtractor) - Unzips and locates GML files
+3. **Convert** (GMLConverter) - Parses GML and converts to CSV/Parquet (4 methods available)
+4. **Transform** (PipelineRunner) - Runs full 26-phase digital-land pipeline
+5. **Report** (PipelineReport) - Generates performance report (JSON + text)
+
+Each step delegates to a specialized class for clean separation of concerns.
+
+## Directory Structure
+
+```
+local_testing/
+├── main.py              # Main orchestration (265 lines)
+├── cli.py               # Command-line interface (121 lines)
+├── file_downloader.py   # Downloads GML files (95 lines)
+├── gml_extractor.py     # ZIP extraction (50 lines)
+├── gml_converter.py     # GML conversion (458 lines)
+├── pipeline_config.py   # Config management (93 lines)
+├── pipeline_runner.py   # 26-phase transformation (254 lines)
+├── pipeline_report.py   # Performance tracking (346 lines)
+├── polars_phases.py     # Polars-optimized phases (767 lines)
+├── Makefile             # Make commands for easy setup and running
+├── README.md            # This file
+├── .gitignore           # Git ignore file
+├── venv/                # Virtual environment (created with: make init)
+├── raw/                 # Downloaded ZIP files
+├── extracted/           # Extracted GML files
+├── converted/           # GML converted to CSV/Parquet
+├── output/              # Pipeline output (harmonised + facts)
+├── reports/             # Performance reports
+├── cache/               # Organisation.csv cache
+├── pipeline/            # Pipeline configuration CSVs
+├── specification/       # digital-land specification files
+└── scripts/             # Helper scripts
+```
+
+## Module Overview
+
+### CLI (`cli.py`)
+- Argument parsing with `argparse`
+- Fetches endpoint list from GitHub
+- Lists and matches data items
+- Clean separation of UI logic
+
+### FileDownloader (`file_downloader.py`)
+- Downloads files from APIs
+- Progress tracking with byte counts
+- Reusable for any file download needs
+
+### GMLExtractor (`gml_extractor.py`)
+- Extracts GML files from ZIP archives
+- Handles nested directory structures
+- Simple, focused responsibility
+
+### GMLConverter (`gml_converter.py`)
+- **4 conversion strategies**:
+  1. Regex → CSV (default, no dependencies)
+  2. Regex → Parquet (Polars)
+  3. DuckDB → CSV (spatial extension)
+  4. DuckDB → Parquet (fastest, best)
+- Parses GML polygons to WKT
+- Handles coordinate transformation
+
+### PipelineConfig (`pipeline_config.py`)
+- Creates pipeline configuration CSVs
+- Downloads organisation.csv
+- Ensures all config files exist
+
+### PipelineRunner (`pipeline_runner.py`)
+- Executes 26-phase digital-land pipeline
+- Lazy imports for fast startup
+- Per-phase timing and metrics
+- Handles Parquet/CSV input
+
+### PipelineReport (`pipeline_report.py`)
+- Tracks step and phase metrics
+- Generates JSON and text reports
+- Calculates durations and throughput
+- Supports comparison reporting
+
+## Output Files
+
+After running the pipeline, you will find:
+
+**Pipeline Output:**
+- `output/{name}_harmonised.csv` - Intermediate harmonised data
+- `output/{name}_facts.csv` - Final fact table output
+- `output/{name}_issues.csv` - Any issues logged during processing
+
+**Performance Reports:**
+
+1. **Single LA Report** (default)
+ - `reports/{name}_{timestamp}_performance.json` - Detailed JSON report
+ - `reports/{name}_{timestamp}_performance.txt` - Human-readable text report
+ - Shows timing for all 26 phases
+
+2. **Selective Phase Report** (when using `--phases`)
+ - Same format as above
+ - Only includes metrics for selected phases
+ - Useful for testing specific transformations
+
+3. **Batch Summary Report** (when using `make run-all`)
+ - `reports/batch_{timestamp}_summary.json` - Aggregate metrics for entire batch
+ - Includes total time, per-LA timing, success/error counts
+ - Shows min/max/average processing times across all LAs
+ - **All run commands now include automatic Polars comparison** (both Original + Polars pipelines)
+
+## Command Line Options
+
+| Option | Description |
+|--------|-------------|
+| `--la NAME` | Item name (partial match) |
+| `--limit N` | Limit number of records to process |
+| `--skip-download` | Use existing downloaded data |
+| `--list` | List all available items |
+| `--use-duckdb` | Use DuckDB with spatial extension for GML conversion (faster, proper CRS transform) |
+| `--use-parquet` | Output Parquet instead of CSV (faster reads, smaller files) |
+| `--phases` | Run specific phases (e.g., `1,2,9` or `1-5,9`) |
+| `--compare` | Run both original and Polars pipelines for comparison (enabled by default in Makefile) |
+
+## GML Conversion Methods
+
+The **GMLConverter** class supports multiple conversion strategies:
+
+### Output Formats
+
+| Format | Flag | Advantages |
+|--------|------|------------|
+| **CSV** | (default) | Universal, human-readable |
+| **Parquet** | `--use-parquet` | 3-10x smaller, faster reads, preserves types |
+
+### Conversion Engines
+
+| Engine | Flag | Speed | Features |
+|--------|------|-------|----------|
+| **Regex** | (default) | Slow | No dependencies |
+| **DuckDB** | `--use-duckdb` | Fast | Proper CRS transform, spatial extension |
+
+### Best Performance
+
+For the fastest conversion, use DuckDB with Parquet output:
+
+```bash
+# Best performance: DuckDB → Parquet
+python main.py --la "Buckinghamshire" --use-duckdb --use-parquet
+```
+
+## Testing the Modular Architecture
+
+All classes are independently testable:
+
+```bash
+# Navigate to directory
+cd digital-land-python/local_testing
+
+# Activate venv
+source venv/bin/activate
+
+# Verify all modules work
+python3 -c "
+from cli import CLI
+from file_downloader import FileDownloader
+from gml_extractor import GMLExtractor
+from gml_converter import GMLConverter
+from pipeline_config import PipelineConfig
+from pipeline_runner import PipelineRunner
+from pipeline_report import PipelineReport
+print('✅ All modules imported successfully')
+"
+```
+
+## Notes
+
+- Virtual environment should be created in `local_testing/venv/`
+- Entity assignment requires a lookup table (`pipeline/lookup.csv`)
+- Without lookups, harmonised data will have empty entity field
+- Facts output will be empty without entity lookups
+- Coordinates are converted from OSGB (EPSG:27700) to WGS84
+- Requirements: `pip install polars duckdb`
+- Parquet uses Snappy compression by default
+- Add `venv/` to `.gitignore` to avoid committing virtual environment
+- **Reusable for other datasets** - Just update the endpoint URL in CLI or main.py
+
+## Development
+
+Each module can be modified independently:
+
+- **Add new conversion method**: Edit `GMLConverter.convert_to_*()` methods
+- **Change CLI options**: Edit `CLI.create_parser()`
+- **Add new pipeline phases**: Edit `PipelineRunner.run_full_pipeline()`
+- **Modify reporting**: Edit `PipelineReport` metrics and output formats
+- **Add new data sources**: Create new downloader classes following `FileDownloader` pattern
+- **Adapt for new datasets**: Update endpoint URLs and field mappings in relevant classes
+
+The modular structure makes it easy to extend and test each component in isolation.
diff --git a/local_testing/pipeline/column.csv b/local_testing/pipeline/column.csv
new file mode 100644
index 00000000..ee2a9062
--- /dev/null
+++ b/local_testing/pipeline/column.csv
@@ -0,0 +1,11 @@
+dataset,resource,column,field
+title-boundary,,reference,reference
+title-boundary,,name,name
+title-boundary,,geometry,geometry
+title-boundary,,start-date,start-date
+title-boundary,,entry-date,entry-date
+title-boundary,,end-date,end-date
+title-boundary,,prefix,prefix
+title-boundary,,organisation,organisation
+title-boundary,,notes,notes
+title-boundary,,national-cadastral-reference,notes
diff --git a/local_testing/pipeline/combine.csv b/local_testing/pipeline/combine.csv
new file mode 100644
index 00000000..cae5fefa
--- /dev/null
+++ b/local_testing/pipeline/combine.csv
@@ -0,0 +1 @@
+dataset,resource,field,fields,separator
diff --git a/local_testing/pipeline/concat.csv b/local_testing/pipeline/concat.csv
new file mode 100644
index 00000000..cae5fefa
--- /dev/null
+++ b/local_testing/pipeline/concat.csv
@@ -0,0 +1 @@
+dataset,resource,field,fields,separator
diff --git a/local_testing/pipeline/convert.csv b/local_testing/pipeline/convert.csv
new file mode 100644
index 00000000..926bf51e
--- /dev/null
+++ b/local_testing/pipeline/convert.csv
@@ -0,0 +1 @@
+dataset,resource,field,value,replacement
diff --git a/local_testing/pipeline/default.csv b/local_testing/pipeline/default.csv
new file mode 100644
index 00000000..8f30d573
--- /dev/null
+++ b/local_testing/pipeline/default.csv
@@ -0,0 +1,2 @@
+dataset,resource,field,default-field,entry-date
+title-boundary,,,entry-date,
diff --git a/local_testing/pipeline/filter.csv b/local_testing/pipeline/filter.csv
new file mode 100644
index 00000000..a9802699
--- /dev/null
+++ b/local_testing/pipeline/filter.csv
@@ -0,0 +1 @@
+dataset,resource,field,pattern
diff --git a/local_testing/pipeline/lookup.csv b/local_testing/pipeline/lookup.csv
new file mode 100644
index 00000000..fee3f4f0
--- /dev/null
+++ b/local_testing/pipeline/lookup.csv
@@ -0,0 +1,11 @@
+prefix,resource,organisation,reference,entity
+title-boundary,,,33205373,12000000001
+title-boundary,,,60898175,12000000002
+title-boundary,,,33209075,12000000003
+title-boundary,,,55955680,12000000004
+title-boundary,,,37316451,12000000005
+title-boundary,,,26291037,12000000006
+title-boundary,,,30556652,12000000007
+title-boundary,,,42046003,12000000008
+title-boundary,,,32896399,12000000009
+title-boundary,,,42173303,12000000010
diff --git a/local_testing/pipeline/migrate.csv b/local_testing/pipeline/migrate.csv
new file mode 100644
index 00000000..728e7bbc
--- /dev/null
+++ b/local_testing/pipeline/migrate.csv
@@ -0,0 +1 @@
+dataset,old-field,new-field
diff --git a/local_testing/pipeline/patch.csv b/local_testing/pipeline/patch.csv
new file mode 100644
index 00000000..478c396a
--- /dev/null
+++ b/local_testing/pipeline/patch.csv
@@ -0,0 +1 @@
+dataset,resource,field,pattern,value
diff --git a/local_testing/pipeline/redirect.csv b/local_testing/pipeline/redirect.csv
new file mode 100644
index 00000000..d3d9f670
--- /dev/null
+++ b/local_testing/pipeline/redirect.csv
@@ -0,0 +1 @@
+entity,status,redirect-entity
diff --git a/local_testing/pipeline/skip.csv b/local_testing/pipeline/skip.csv
new file mode 100644
index 00000000..d5f3eaff
--- /dev/null
+++ b/local_testing/pipeline/skip.csv
@@ -0,0 +1 @@
+dataset,resource,pattern
From 1ef4f09cca398bb8b1856d58cb65d30780670d4b Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Fri, 6 Feb 2026 11:43:56 +0000
Subject: [PATCH 11/15] =?UTF-8?q?Refactor=20pipeline=20scripts:=20remove?=
=?UTF-8?q?=20old=20main.py,=20pipeline=5Freport.py,=20and=20pipeline=5Fru?=
=?UTF-8?q?nner.py;=20add=20new=20implementations=20for=20main=20pipeline?=
=?UTF-8?q?=20orchestration=20and=20reporting=20Rapid=20local=20performanc?=
=?UTF-8?q?e=20test=20environment=20supporting=20the=20Polars=E2=80=91base?=
=?UTF-8?q?d=20transformation=20rewrite=20in=20digital-land-python=20Fixes?=
=?UTF-8?q?=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 4 -
local_testing/main.py | 365 +++++++++++++++++++++++
local_testing/pipeline_report.py | 477 +++++++++++++++++++++++++++++++
local_testing/pipeline_runner.py | 444 ++++++++++++++++++++++++++++
4 files changed, 1286 insertions(+), 4 deletions(-)
create mode 100644 local_testing/main.py
create mode 100644 local_testing/pipeline_report.py
create mode 100644 local_testing/pipeline_runner.py
diff --git a/.gitignore b/.gitignore
index d65ec89a..578c0446 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,7 +49,3 @@ notebooks/.ipynb_checkpoints
/local_testing/specification/
/local_testing/venv/
-/local_testing/main.py
-/local_testing/pipeline_report.py
-/local_testing/pipeline_runner.py
-
diff --git a/local_testing/main.py b/local_testing/main.py
new file mode 100644
index 00000000..34b5c586
--- /dev/null
+++ b/local_testing/main.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python3
+"""
+Title Boundary Pipeline - Download, Convert, and Transform GML data from Land Registry
+
+Orchestration script that coordinates multiple specialized classes.
+"""
+
+import sys
+import time
+from pathlib import Path
+from datetime import datetime
+
+from cli import CLI
+from file_downloader import FileDownloader
+from gml_extractor import GMLExtractor
+from gml_converter import GMLConverter
+from pipeline_config import PipelineConfig
+from pipeline_runner import PipelineRunner
+from pipeline_report import PipelineReport
+
+
+# =============================================================================
+# Constants
+# =============================================================================
+
+SCRIPT_DIR = Path(__file__).parent.resolve()
+DATASET = "title-boundary"
+
+
+# =============================================================================
+# Helper Functions
+# =============================================================================
+
+
+def parse_phase_selection(phases_str: str) -> set:
+ """
+ Parse phase selection string into set of phase numbers.
+
+ Args:
+ phases_str: Comma-separated phase numbers or ranges (e.g., "1,2,9" or "1-5,9")
+
+ Returns:
+ Set of selected phase numbers, or None if invalid
+ """
+ phases = set()
+ try:
+ for part in phases_str.split(","):
+ part = part.strip()
+ if "-" in part:
+ # Range: "1-5"
+ start, end = part.split("-")
+ phases.update(range(int(start), int(end) + 1))
+ else:
+ # Single phase: "9"
+ phases.add(int(part))
+
+ # Validate phase numbers (1-26)
+ if any(p < 1 or p > 26 for p in phases):
+ return None
+
+ return phases
+ except (ValueError, AttributeError):
+ return None
+
+
+# =============================================================================
+# Main Entry Point
+# =============================================================================
+
+
+def main():
+ """Main entry point for title-boundary pipeline."""
+
+ # Parse arguments using CLI class
+ parser = CLI.create_parser()
+ args = parser.parse_args()
+
+ # Setup directories
+ raw_dir = SCRIPT_DIR / "raw"
+ extracted_dir = SCRIPT_DIR / "extracted"
+ converted_dir = SCRIPT_DIR / "converted"
+ output_dir = SCRIPT_DIR / "output"
+ pipeline_dir = SCRIPT_DIR / "pipeline"
+ specification_dir = SCRIPT_DIR.parent / "specification"
+ cache_dir = SCRIPT_DIR / "cache"
+ reports_dir = SCRIPT_DIR / "reports"
+
+ for directory in [
+ raw_dir,
+ extracted_dir,
+ converted_dir,
+ output_dir,
+ pipeline_dir,
+ cache_dir,
+ reports_dir,
+ ]:
+ directory.mkdir(parents=True, exist_ok=True)
+
+ # List mode - use CLI class
+ if args.list or not args.la:
+ CLI.list_available_las()
+ if not args.la:
+ print("Use --la 'Name' to process a specific Local Authority")
+ return 0
+
+ # Find matching LA - use CLI class
+ endpoint, la_name = CLI.find_matching_la(args.la)
+ if not endpoint:
+ return 1
+
+ # Initialize
+ la_slug = la_name.lower().replace(" ", "_").replace(",", "")
+ report = PipelineReport()
+ report.local_authority = la_name
+ report.dataset = DATASET
+
+ # Print header
+ print(f"\n{'='*60}")
+ print("Title Boundary Pipeline")
+ print(f"{'='*60}")
+ print(f"Local Authority: {la_name}")
+ print(f"Endpoint: {endpoint['url']}")
+ if args.limit:
+ print(f"Limit: {args.limit:,} records")
+ print(f"{'='*60}\n")
+
+ overall_start = time.time()
+
+ # =========================================================================
+ # Step 1: Download - use FileDownloader class
+ # =========================================================================
+ print("Step 1: Download")
+ print("-" * 40)
+
+ step_download = report.add_step("Download")
+ zip_path = raw_dir / f"{la_slug}.zip"
+
+ if args.skip_download and zip_path.exists():
+ print(f" Using existing: {zip_path}")
+ step_download.mark_complete(success=True)
+ else:
+ downloader = FileDownloader()
+ success = downloader.download_file(endpoint["url"], zip_path)
+ step_download.mark_complete(success=success)
+ if not success:
+ print(" Download failed")
+ return 1
+
+ if zip_path.exists():
+ report.zip_size_mb = zip_path.stat().st_size / (1024 * 1024)
+
+ # =========================================================================
+ # Step 2: Extract - use GMLExtractor class
+ # =========================================================================
+ print("\nStep 2: Extract")
+ print("-" * 40)
+
+ step_extract = report.add_step("Extract")
+ extract_subdir = extracted_dir / la_slug
+
+ try:
+ gml_path = GMLExtractor.extract_gml_from_zip(zip_path, extract_subdir)
+ step_extract.mark_complete(success=True)
+
+ if gml_path.exists():
+ report.gml_size_mb = gml_path.stat().st_size / (1024 * 1024)
+ except Exception as e:
+ print(f" Extraction failed: {e}")
+ step_extract.mark_complete(success=False)
+ return 1
+
+ # =========================================================================
+ # Step 3: Convert - use GMLConverter class
+ # =========================================================================
+ output_format = "Parquet" if args.use_parquet else "CSV"
+ print(f"\nStep 3: Convert GML to {output_format}")
+ print("-" * 40)
+
+ step_convert = report.add_step("Convert")
+ converter = GMLConverter()
+
+ # Choose conversion method based on arguments
+ if args.use_duckdb and args.use_parquet:
+ method = "DuckDB+Parquet"
+ output_path = converted_dir / f"{la_slug}.parquet"
+ record_count = converter.convert_to_parquet_duckdb(
+ gml_path, output_path, limit=args.limit
+ )
+ elif args.use_duckdb:
+ method = "DuckDB+CSV"
+ output_path = converted_dir / f"{la_slug}.csv"
+ record_count = converter.convert_to_csv_duckdb(
+ gml_path, output_path, limit=args.limit
+ )
+ elif args.use_parquet:
+ method = "Polars+Parquet"
+ output_path = converted_dir / f"{la_slug}.parquet"
+ record_count = converter.convert_to_parquet(
+ gml_path, output_path, limit=args.limit
+ )
+ else:
+ method = "Polars+CSV"
+ output_path = converted_dir / f"{la_slug}.csv"
+ record_count = converter.convert_to_csv(gml_path, output_path, limit=args.limit)
+
+ step_convert.mark_complete(
+ success=record_count > 0, record_count=record_count, method=method
+ )
+
+ if record_count == 0:
+ print(" Conversion produced no records")
+ return 1
+
+ report.input_records = record_count
+
+ # =========================================================================
+ # Step 4: Transform - use PipelineConfig and PipelineRunner classes
+ # =========================================================================
+ print("\nStep 4: Transform through Pipeline")
+ print("-" * 40)
+
+ step_transform = report.add_step("Transform")
+
+ # Ensure configuration exists using PipelineConfig class
+ PipelineConfig.ensure_pipeline_config(pipeline_dir)
+
+ if not specification_dir.exists():
+ print(f" Error: Specification directory not found: {specification_dir}")
+ print(f" Please clone specification to: {specification_dir}")
+ step_transform.mark_complete(success=False)
+ return 1
+
+ # Parse phase selection if provided
+ selected_phases = None
+ if args.phases:
+ selected_phases = parse_phase_selection(args.phases)
+ if selected_phases:
+ print(f" Running selected phases: {sorted(selected_phases)}")
+ report.selected_phases = selected_phases # Store in report for filtering
+ else:
+ print(f" Invalid phase selection: {args.phases}")
+ step_transform.mark_complete(success=False)
+ return 1
+
+ # Run pipeline using PipelineRunner class
+ runner = PipelineRunner(dataset=DATASET)
+ results = runner.run_full_pipeline(
+ input_csv=output_path,
+ output_dir=output_dir,
+ specification_dir=specification_dir,
+ pipeline_dir=pipeline_dir,
+ cache_dir=cache_dir,
+ la_name=la_name,
+ report=report,
+ selected_phases=selected_phases,
+ )
+
+ step_transform.mark_complete(
+ success=True,
+ harmonised_records=results["harmonised"],
+ fact_records=results["facts"],
+ transform_time=results.get("transform_time", 0),
+ )
+
+ # Run Polars pipeline for comparison if requested
+ if args.compare:
+ print("\n Running Polars pipeline for comparison...")
+ from polars_phases import run_polars_pipeline, PolarsPhaseMetrics
+
+ # Define required parameters
+ field_datatype_map = {"geometry": "text"} # Simplified for now
+ intermediate_fieldnames = ["entity", "name", "geometry", "organisation"]
+ factor_fieldnames = ["entity", "fact"]
+
+ polars_harmonised = output_dir / f"{la_name}_polars_harmonised.csv"
+ polars_facts = output_dir / f"{la_name}_polars_facts.csv"
+
+ polars_start = time.time()
+ polars_metrics, polars_harm_count, polars_fact_count = run_polars_pipeline(
+ input_csv=output_path,
+ harmonised_csv=polars_harmonised,
+ facts_csv=polars_facts,
+ field_datatype_map=field_datatype_map,
+ intermediate_fieldnames=intermediate_fieldnames,
+ factor_fieldnames=factor_fieldnames,
+ dataset=DATASET,
+ selected_phases=selected_phases, # Pass phase selection to Polars
+ )
+ polars_end = time.time()
+
+ # Store Polars metrics in report
+ report.polars_phases = []
+ for metric in polars_metrics:
+ from pipeline_report import PhaseMetrics
+ phase_metric = PhaseMetrics(
+ name=metric.name,
+ phase_number=metric.phase_number,
+ start_time=0,
+ end_time=0,
+ duration_seconds=metric.duration_seconds,
+ input_count=metric.input_count,
+ output_count=metric.output_count,
+ )
+ report.polars_phases.append(phase_metric)
+
+ report.polars_harmonised_records = polars_harm_count
+ report.polars_fact_records = polars_fact_count
+ report.polars_transform_seconds = polars_end - polars_start
+
+ speedup = results.get("transform_time", 0) / report.polars_transform_seconds if report.polars_transform_seconds > 0 else 0
+ print(f" Polars transform time: {report.polars_transform_seconds:.3f}s")
+ print(f" Speedup: {speedup:.1f}x faster")
+
+ # =========================================================================
+ # Step 5: Generate Report - use PipelineReport class
+ # =========================================================================
+ overall_end = time.time()
+ report.total_duration_seconds = overall_end - overall_start
+ report.calculate_totals()
+
+ print("\nStep 5: Generate Performance Report")
+ print("-" * 40)
+
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ json_path = reports_dir / f"{la_slug}_{timestamp}_performance.json"
+ text_path = reports_dir / f"{la_slug}_{timestamp}_performance.txt"
+
+ report.save_json(json_path)
+ report.save_text(text_path)
+
+ print(f" JSON report: {json_path}")
+ print(f" Text report: {text_path}")
+
+ # =========================================================================
+ # Summary
+ # =========================================================================
+ print(f"\n{'='*60}")
+ print("PIPELINE COMPLETE")
+ print(f"{'='*60}")
+ print(f"Local Authority: {la_name}")
+ print(f"Dataset: {DATASET}")
+ print(f"Total Duration: {report.total_duration_seconds:.2f}s")
+ print(f"Input Records: {report.input_records:,}")
+ print(f"Harmonised Records: {report.harmonised_records:,}")
+ print(f"Fact Records: {report.fact_records:,}")
+
+ if report.steps:
+ print(f"\nStep Summary:")
+ for name, step in report.steps.items():
+        status = "✓" if step.success else "✗"
+ print(f" {status} {name:<20} {step.duration_seconds:8.3f}s")
+
+ if report.phases:
+ total_phase_time = sum(p.duration_seconds for p in report.phases)
+ print(
+ f"\nTransform Phases: {len(report.phases)} phases, {total_phase_time:.3f}s total"
+ )
+
+ print(f"{'='*60}\n")
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main() or 0)
diff --git a/local_testing/pipeline_report.py b/local_testing/pipeline_report.py
new file mode 100644
index 00000000..cb4b9c26
--- /dev/null
+++ b/local_testing/pipeline_report.py
@@ -0,0 +1,477 @@
+"""
+Performance reporting and metrics tracking for pipeline runs.
+
+Provides classes to track timing, resource usage, and comparison
+metrics for original vs Polars pipeline implementations.
+"""
+
+import sys
+import time
+import platform as plat
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, field
+
+
+@dataclass
+class PhaseMetrics:
+ """Metrics for a single pipeline phase."""
+
+ name: str
+ phase_number: int
+ start_time: float = 0.0
+ end_time: float = 0.0
+ duration_seconds: float = 0.0
+ input_count: int = 0
+ output_count: int = 0
+
+ def complete(self, output_count: int = 0):
+ """Mark phase as complete and calculate duration."""
+ self.end_time = time.time()
+ self.duration_seconds = self.end_time - self.start_time
+ self.output_count = output_count
+
+
+@dataclass
+class StepMetrics:
+ """Metrics for a pipeline step (Download, Extract, Convert, Transform)."""
+
+ name: str
+ start_time: float = 0.0
+ end_time: float = 0.0
+ duration_seconds: float = 0.0
+ success: bool = True
+ details: Dict[str, Any] = field(default_factory=dict)
+
+ def start(self):
+ """Start timing this step."""
+ self.start_time = time.time()
+
+ def complete(self, **details):
+ """Mark step as complete."""
+ self.end_time = time.time()
+ self.duration_seconds = self.end_time - self.start_time
+ self.details.update(details)
+
+ def mark_complete(self, success: bool = True, **details):
+ """Mark step as complete with success status."""
+ self.end_time = time.time()
+ self.duration_seconds = self.end_time - self.start_time
+ self.success = success
+ self.details.update(details)
+
+
+@dataclass
+class PipelineReport:
+ """Complete performance report for a pipeline run."""
+
+ # Run metadata
+ run_id: str = ""
+ timestamp: str = ""
+ local_authority: str = ""
+ dataset: str = "title-boundary"
+ record_limit: Optional[int] = None
+
+ # Input/Output metrics
+ input_records: int = 0
+ harmonised_records: int = 0
+ fact_records: int = 0
+
+ # Polars comparison metrics
+ polars_harmonised_records: int = 0
+ polars_fact_records: int = 0
+ polars_phases: List[PhaseMetrics] = field(default_factory=list)
+ polars_transform_seconds: float = 0.0
+
+ # File sizes
+ zip_size_mb: float = 0.0
+ gml_size_mb: float = 0.0
+ csv_size_mb: float = 0.0
+
+ # Step timings
+ steps: Dict[str, StepMetrics] = field(default_factory=dict)
+
+ # Phase timings (transformation only)
+ phases: List[PhaseMetrics] = field(default_factory=list)
+
+ # Phase selection (if running specific phases)
+ selected_phases: Optional[set] = None
+
+ # Total timing
+ total_duration_seconds: float = 0.0
+ transform_duration_seconds: float = 0.0
+
+ # System info
+ python_version: str = ""
+ platform: str = ""
+
+ def __post_init__(self):
+ """Initialize run metadata."""
+ self.python_version = sys.version.split()[0]
+ self.platform = f"{plat.system()} {plat.release()}"
+ self.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
+ self.timestamp = datetime.now().isoformat()
+
+ def add_step(self, name: str) -> StepMetrics:
+ """Add and start a new step."""
+ step = StepMetrics(name=name)
+ step.start()
+ self.steps[name] = step
+ return step
+
+ def add_phase(self, name: str, phase_number: int) -> PhaseMetrics:
+ """Add a new phase."""
+ phase = PhaseMetrics(
+ name=name, phase_number=phase_number, start_time=time.time()
+ )
+ self.phases.append(phase)
+ return phase
+
+ def calculate_totals(self):
+ """Calculate total durations."""
+ self.total_duration_seconds = sum(
+ s.duration_seconds for s in self.steps.values()
+ )
+ self.transform_duration_seconds = sum(p.duration_seconds for p in self.phases)
+
+ def to_dict(self) -> Dict:
+ """Convert to dictionary for JSON serialization."""
+ # Filter phases if selection is active
+ phases_to_output = self.phases
+ polars_phases_to_output = self.polars_phases
+ if self.selected_phases:
+ phases_to_output = [p for p in self.phases if p.phase_number in self.selected_phases]
+ polars_phases_to_output = [p for p in self.polars_phases if p.phase_number in self.selected_phases]
+
+ return {
+ "run_id": self.run_id,
+ "timestamp": self.timestamp,
+ "local_authority": self.local_authority,
+ "dataset": self.dataset,
+ "record_limit": self.record_limit,
+ "selected_phases": list(sorted(self.selected_phases)) if self.selected_phases else None,
+ "input_records": self.input_records,
+ "harmonised_records": self.harmonised_records,
+ "fact_records": self.fact_records,
+ "file_sizes": {
+ "zip_mb": self.zip_size_mb,
+ "gml_mb": self.gml_size_mb,
+ "csv_mb": self.csv_size_mb,
+ },
+ "timing": {
+ "total_seconds": self.total_duration_seconds,
+ "transform_seconds": self.transform_duration_seconds,
+ "polars_transform_seconds": self.polars_transform_seconds,
+ "speedup_factor": (
+ (self.transform_duration_seconds / self.polars_transform_seconds)
+ if self.polars_transform_seconds > 0
+ else 0
+ ),
+ "steps": {
+ name: {"duration_seconds": s.duration_seconds, **s.details}
+ for name, s in self.steps.items()
+ },
+ "phases": [
+ {
+ "number": p.phase_number,
+ "name": p.name,
+ "duration_seconds": p.duration_seconds,
+ "output_count": p.output_count,
+ }
+ for p in phases_to_output
+ ],
+ "polars_phases": [
+ {
+ "number": p.phase_number,
+ "name": p.name,
+ "duration_seconds": p.duration_seconds,
+ "output_count": p.output_count,
+ }
+ for p in polars_phases_to_output
+ ],
+ },
+ "comparison": {
+ "original_transform_seconds": self.transform_duration_seconds,
+ "polars_transform_seconds": self.polars_transform_seconds,
+ "speedup_factor": (
+ (self.transform_duration_seconds / self.polars_transform_seconds)
+ if self.polars_transform_seconds > 0
+ else 0
+ ),
+ "time_saved_seconds": self.transform_duration_seconds
+ - self.polars_transform_seconds,
+ },
+ "system": {
+ "python_version": self.python_version,
+ "platform": self.platform,
+ },
+ }
+
+ def generate_text_report(self) -> str:
+ """Generate human-readable text report."""
+ lines = []
+ lines.append("=" * 100)
+ lines.append("TITLE BOUNDARY PIPELINE - PERFORMANCE REPORT")
+ lines.append("=" * 100)
+ lines.append("")
+ lines.append(f"Run ID: {self.run_id}")
+ lines.append(f"Timestamp: {self.timestamp}")
+ lines.append(f"Local Authority: {self.local_authority}")
+ lines.append(f"Dataset: {self.dataset}")
+ lines.append(f"Record Limit: {self.record_limit or 'None (all records)'}")
+ lines.append("")
+
+ lines.append("-" * 100)
+ lines.append("INPUT/OUTPUT SUMMARY")
+ lines.append("-" * 100)
+ lines.append(f"Input Records: {self.input_records:,}")
+ if self.polars_phases:
+ lines.append(
+ f"Harmonised Records: {self.harmonised_records:,} (Original) / {self.polars_harmonised_records:,} (Polars)"
+ )
+ lines.append(
+ f"Fact Records: {self.fact_records:,} (Original) / {self.polars_fact_records:,} (Polars)"
+ )
+ else:
+ lines.append(f"Harmonised Records: {self.harmonised_records:,}")
+ lines.append(f"Fact Records: {self.fact_records:,}")
+ lines.append("")
+
+ lines.append("-" * 100)
+ lines.append("FILE SIZES")
+ lines.append("-" * 100)
+ lines.append(f"ZIP File: {self.zip_size_mb:,.2f} MB")
+ lines.append(f"GML File: {self.gml_size_mb:,.2f} MB")
+ lines.append(f"CSV File: {self.csv_size_mb:,.2f} MB")
+ lines.append("")
+
+ lines.append("-" * 100)
+ lines.append("STEP TIMING SUMMARY")
+ lines.append("-" * 100)
+ lines.append(f"{'Step':<20} {'Duration':>12} {'% of Total':>12}")
+ lines.append("-" * 44)
+ for name, step in self.steps.items():
+ pct = (
+ (step.duration_seconds / self.total_duration_seconds * 100)
+ if self.total_duration_seconds > 0
+ else 0
+ )
+ lines.append(f"{name:<20} {step.duration_seconds:>10.3f}s {pct:>10.1f}%")
+ lines.append("-" * 44)
+ lines.append(
+ f"{'TOTAL':<20} {self.total_duration_seconds:>10.3f}s {100.0:>10.1f}%"
+ )
+ lines.append("")
+
+ # COMBINED PHASE COMPARISON TABLE (if Polars was run)
+ if self.polars_phases:
+ lines.append("=" * 100)
+ lines.append("PHASE-BY-PHASE COMPARISON: ORIGINAL vs POLARS")
+ lines.append("=" * 100)
+
+ # Show phase selection info if applicable
+ if self.selected_phases:
+ lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
+ lines.append("")
+
+ # Header
+ lines.append(
+ f"{'#':<3} {'Phase Name':<26} {'Original':>11} {'Polars':>11} {'Speedup':>10} {'Time Saved':>12} {'Orig Out':>10} {'Polars Out':>10}"
+ )
+ lines.append("-" * 100)
+
+ # Build lookup for Polars phases by name
+ polars_by_name = {p.name: p for p in self.polars_phases}
+
+ # Filter phases if selection is active
+ phases_to_display = self.phases
+ if self.selected_phases:
+ phases_to_display = [p for p in self.phases if p.phase_number in self.selected_phases]
+
+ total_original = 0.0
+ total_polars = 0.0
+ total_saved = 0.0
+
+ for phase in phases_to_display:
+ polars_phase = polars_by_name.get(phase.name)
+ if polars_phase:
+ if polars_phase.duration_seconds > 0:
+ speedup = phase.duration_seconds / polars_phase.duration_seconds
+ else:
+ speedup = float("inf") if phase.duration_seconds > 0 else 1.0
+
+ saved = phase.duration_seconds - polars_phase.duration_seconds
+                    speedup_str = f"{speedup:.1f}x" if speedup != float("inf") else "∞"
+
+ lines.append(
+ f"{phase.phase_number:<3} {phase.name:<26} {phase.duration_seconds:>9.4f}s {polars_phase.duration_seconds:>9.4f}s {speedup_str:>9} {saved:>10.4f}s {phase.output_count:>10,} {polars_phase.output_count:>10,}"
+ )
+
+ total_original += phase.duration_seconds
+ total_polars += polars_phase.duration_seconds
+ total_saved += saved
+ else:
+ lines.append(
+ f"{phase.phase_number:<3} {phase.name:<26} {phase.duration_seconds:>9.4f}s {'N/A':>11} {'N/A':>9} {'N/A':>12} {phase.output_count:>10,} {'N/A':>10}"
+ )
+
+ lines.append("-" * 100)
+ overall_speedup = total_original / total_polars if total_polars > 0 else 0
+ lines.append(
+ f"{'':3} {'TOTAL TRANSFORM TIME':<26} {total_original:>9.4f}s {total_polars:>9.4f}s {overall_speedup:>8.1f}x {total_saved:>10.4f}s"
+ )
+ lines.append("")
+
+ # Overall summary
+ lines.append("-" * 100)
+ lines.append("PERFORMANCE SUMMARY")
+ lines.append("-" * 100)
+ lines.append(f"Original Pipeline: {total_original:.4f}s")
+ lines.append(f"Polars Pipeline: {total_polars:.4f}s")
+ lines.append(f"Speedup Factor: {overall_speedup:.1f}x faster")
+ lines.append(
+ f"Time Saved: {total_saved:.4f}s ({(total_saved/total_original*100):.1f}% reduction)"
+ )
+ lines.append("")
+
+ else:
+ lines.append("-" * 100)
+ lines.append("ORIGINAL PIPELINE - PHASE TIMING (Row-by-Row)")
+ lines.append("-" * 100)
+
+ # Show phase selection info if applicable
+ if self.selected_phases:
+ lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
+ lines.append("")
+
+ lines.append(
+ f"{'#':<4} {'Phase Name':<30} {'Duration':>12} {'% of Transform':>14} {'Output':>10}"
+ )
+ lines.append("-" * 74)
+
+ # Filter phases if selection is active
+ phases_to_display = self.phases
+ if self.selected_phases:
+ phases_to_display = [p for p in self.phases if p.phase_number in self.selected_phases]
+
+ for phase in phases_to_display:
+ pct = (
+ (phase.duration_seconds / self.transform_duration_seconds * 100)
+ if self.transform_duration_seconds > 0
+ else 0
+ )
+ lines.append(
+ f"{phase.phase_number:<4} {phase.name:<30} {phase.duration_seconds:>10.4f}s {pct:>12.1f}% {phase.output_count:>10,}"
+ )
+
+ lines.append("-" * 74)
+ lines.append(
+ f"{'':4} {'TOTAL TRANSFORM TIME':<30} {self.transform_duration_seconds:>10.4f}s {100.0:>12.1f}%"
+ )
+ lines.append("")
+
+ # Top 5 slowest phases (Original)
+ lines.append("-" * 100)
+ lines.append("TOP 5 SLOWEST PHASES (Original Pipeline)")
+ lines.append("-" * 100)
+
+ # Filter phases for "top slowest" if selection is active
+ phases_for_top5 = self.phases
+ if self.selected_phases:
+ phases_for_top5 = [p for p in self.phases if p.phase_number in self.selected_phases]
+
+ sorted_phases = sorted(
+ phases_for_top5, key=lambda x: x.duration_seconds, reverse=True
+ )[:5]
+ for i, phase in enumerate(sorted_phases, 1):
+ pct = (
+ (phase.duration_seconds / self.transform_duration_seconds * 100)
+ if self.transform_duration_seconds > 0
+ else 0
+ )
+ lines.append(
+ f" {i}. {phase.name:<30} {phase.duration_seconds:>10.4f}s ({pct:.1f}%)"
+ )
+ lines.append("")
+
+ # TOP SPEEDUP WINNERS (if Polars was run)
+ if self.polars_phases:
+ lines.append("-" * 100)
+ lines.append("TOP 5 SPEEDUP WINNERS (Biggest Improvements with Polars)")
+ lines.append("-" * 100)
+
+ # Filter phases for speedup calculation if selection is active
+ phases_for_speedup = self.phases
+ if self.selected_phases:
+ phases_for_speedup = [p for p in self.phases if p.phase_number in self.selected_phases]
+
+ polars_by_name = {p.name: p for p in self.polars_phases}
+ speedups = []
+ for phase in phases_for_speedup:
+ polars_phase = polars_by_name.get(phase.name)
+ if polars_phase and phase.duration_seconds > 0.0001:
+ if polars_phase.duration_seconds > 0:
+ speedup = phase.duration_seconds / polars_phase.duration_seconds
+ else:
+ speedup = float("inf")
+ saved = phase.duration_seconds - polars_phase.duration_seconds
+ speedups.append(
+ (
+ phase.name,
+ phase.duration_seconds,
+ polars_phase.duration_seconds,
+ speedup,
+ saved,
+ )
+ )
+
+ speedups.sort(key=lambda x: x[4], reverse=True)
+
+ for i, (name, orig, polars, spd, saved) in enumerate(speedups[:5], 1):
+            spd_str = f"{spd:.1f}x" if spd != float("inf") else "∞"
+ lines.append(
+                f" {i}. {name:<26} {orig:.4f}s → {polars:.4f}s ({spd_str} faster, {saved:.4f}s saved)"
+ )
+ lines.append("")
+
+ # THROUGHPUT METRICS
+ if (
+ self.polars_phases
+ and self.input_records > 0
+ and self.transform_duration_seconds > 0
+ and self.polars_transform_seconds > 0
+ ):
+ lines.append("-" * 100)
+ lines.append("THROUGHPUT METRICS")
+ lines.append("-" * 100)
+ orig_throughput = self.input_records / self.transform_duration_seconds
+ polars_throughput = self.input_records / self.polars_transform_seconds
+ lines.append(f"Original Pipeline: {orig_throughput:,.0f} records/second")
+ lines.append(f"Polars Pipeline: {polars_throughput:,.0f} records/second")
+ lines.append(
+ f"Throughput Gain: {polars_throughput - orig_throughput:,.0f} records/second faster"
+ )
+ lines.append("")
+
+ lines.append("=" * 100)
+ lines.append(
+ f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+ )
+ lines.append("=" * 100)
+
+ return "\n".join(lines)
+
+ def save_json(self, path: Path):
+ """Save report as JSON file."""
+ import json
+
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with open(path, 'w') as f:
+ json.dump(self.to_dict(), f, indent=2)
+
+ def save_text(self, path: Path):
+ """Save report as text file."""
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with open(path, 'w') as f:
+ f.write(self.generate_text_report())
diff --git a/local_testing/pipeline_runner.py b/local_testing/pipeline_runner.py
new file mode 100644
index 00000000..3f69ccfd
--- /dev/null
+++ b/local_testing/pipeline_runner.py
@@ -0,0 +1,444 @@
+"""
+Pipeline execution engine for title-boundary dataset.
+
+Handles running the full 26-phase digital-land transformation pipeline
+with detailed timing and progress tracking.
+"""
+
+import time
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, Optional
+
+from pipeline_config import PipelineConfig
+
+
+class PipelineRunner:
+ """Executes the digital-land transformation pipeline with timing."""
+
+ def __init__(self, dataset: str = "title-boundary"):
+ """
+ Initialize pipeline runner.
+
+ Args:
+ dataset: Name of the dataset being processed
+ """
+ self.dataset = dataset
+ self.pipeline_imports = None
+
+ def get_pipeline_imports(self):
+ """
+ Lazy import of digital-land pipeline modules.
+
+ Returns dict of imported classes and functions.
+ """
+ if self.pipeline_imports is not None:
+ return self.pipeline_imports
+
+ from digital_land.phase.convert import ConvertPhase
+ from digital_land.phase.normalise import NormalisePhase
+ from digital_land.phase.parse import ParsePhase
+ from digital_land.phase.concat import ConcatFieldPhase
+ from digital_land.phase.filter import FilterPhase
+ from digital_land.phase.map import MapPhase
+ from digital_land.phase.patch import PatchPhase
+ from digital_land.phase.harmonise import HarmonisePhase
+ from digital_land.phase.default import DefaultPhase
+ from digital_land.phase.migrate import MigratePhase
+ from digital_land.phase.organisation import OrganisationPhase
+ from digital_land.phase.prune import (
+ FieldPrunePhase,
+ EntityPrunePhase,
+ FactPrunePhase,
+ )
+ from digital_land.phase.reference import (
+ EntityReferencePhase,
+ FactReferencePhase,
+ )
+ from digital_land.phase.prefix import EntityPrefixPhase
+ from digital_land.phase.lookup import EntityLookupPhase, FactLookupPhase
+ from digital_land.phase.priority import PriorityPhase
+ from digital_land.phase.pivot import PivotPhase
+ from digital_land.phase.combine import FactCombinePhase
+ from digital_land.phase.factor import FactorPhase
+ from digital_land.phase.save import SavePhase
+ from digital_land.pipeline.main import Pipeline
+ from digital_land.specification import Specification
+ from digital_land.organisation import Organisation
+ from digital_land.log import (
+ IssueLog,
+ ColumnFieldLog,
+ DatasetResourceLog,
+ OperationalIssueLog,
+ ConvertedResourceLog,
+ )
+ from digital_land.api import API
+
+ self.pipeline_imports = {
+ "ConvertPhase": ConvertPhase,
+ "NormalisePhase": NormalisePhase,
+ "ParsePhase": ParsePhase,
+ "ConcatFieldPhase": ConcatFieldPhase,
+ "FilterPhase": FilterPhase,
+ "MapPhase": MapPhase,
+ "PatchPhase": PatchPhase,
+ "HarmonisePhase": HarmonisePhase,
+ "DefaultPhase": DefaultPhase,
+ "MigratePhase": MigratePhase,
+ "OrganisationPhase": OrganisationPhase,
+ "FieldPrunePhase": FieldPrunePhase,
+ "EntityPrunePhase": EntityPrunePhase,
+ "FactPrunePhase": FactPrunePhase,
+ "EntityReferencePhase": EntityReferencePhase,
+ "FactReferencePhase": FactReferencePhase,
+ "EntityPrefixPhase": EntityPrefixPhase,
+ "EntityLookupPhase": EntityLookupPhase,
+ "FactLookupPhase": FactLookupPhase,
+ "PriorityPhase": PriorityPhase,
+ "PivotPhase": PivotPhase,
+ "FactCombinePhase": FactCombinePhase,
+ "FactorPhase": FactorPhase,
+ "SavePhase": SavePhase,
+ "Pipeline": Pipeline,
+ "Specification": Specification,
+ "Organisation": Organisation,
+ "IssueLog": IssueLog,
+ "ColumnFieldLog": ColumnFieldLog,
+ "DatasetResourceLog": DatasetResourceLog,
+ "OperationalIssueLog": OperationalIssueLog,
+ "ConvertedResourceLog": ConvertedResourceLog,
+ "API": API,
+ }
+
+ return self.pipeline_imports
+
+ def run_full_pipeline(
+ self,
+ input_csv: Path,
+ output_dir: Path,
+ specification_dir: Path,
+ pipeline_dir: Path,
+ cache_dir: Path,
+ la_name: str,
+ report=None,
+ selected_phases=None,
+ ) -> Dict:
+ """
+ Run the full 26-phase digital-land transformation pipeline.
+
+ Args:
+ input_csv: Path to input CSV/Parquet file
+ output_dir: Directory for output files
+ specification_dir: Directory containing specification files
+ pipeline_dir: Directory containing pipeline configuration
+ cache_dir: Directory for cached resources
+ la_name: Local Authority name/slug
+ report: Optional PipelineReport instance for metrics tracking
+ selected_phases: Optional set of phase numbers (1-26) to run
+
+ Returns:
+ Dict with results including file paths and record counts
+ """
+ print(" Loading digital-land pipeline modules...")
+ p = self.get_pipeline_imports()
+
+ # Convert Parquet to CSV if needed (original pipeline only supports CSV)
+ if input_csv.suffix.lower() == ".parquet":
+ import polars as pl
+
+ csv_input = input_csv.with_suffix(".csv")
+ if not csv_input.exists():
+ print(f" Converting Parquet to CSV for original pipeline...")
+ pl.read_parquet(input_csv).write_csv(csv_input)
+ input_csv = csv_input
+
+ # Set up output paths
+ harmonised_csv = output_dir / f"{la_name}_harmonised.csv"
+ facts_csv = output_dir / f"{la_name}_facts.csv"
+ issue_csv = output_dir / f"{la_name}_issues.csv"
+
+ print(f" Input: {input_csv}")
+ print(f" Harmonised: {harmonised_csv}")
+ print(f" Facts: {facts_csv}")
+
+ # Load configuration
+ specification = p["Specification"](str(specification_dir))
+ pipeline = p["Pipeline"](
+ str(pipeline_dir), self.dataset, specification=specification
+ )
+ schema = specification.pipeline.get(pipeline.name, {}).get(
+ "schema", self.dataset
+ )
+ intermediate_fieldnames = specification.intermediate_fieldnames(pipeline)
+ factor_fieldnames = specification.factor_fieldnames()
+
+ # Create logs
+ resource = la_name.lower().replace(" ", "_")
+ issue_log = p["IssueLog"](dataset=self.dataset, resource=resource)
+ operational_issue_log = p["OperationalIssueLog"](
+ dataset=self.dataset, resource=resource
+ )
+ column_field_log = p["ColumnFieldLog"](dataset=self.dataset, resource=resource)
+ dataset_resource_log = p["DatasetResourceLog"](
+ dataset=self.dataset, resource=resource
+ )
+ converted_resource_log = p["ConvertedResourceLog"](
+ dataset=self.dataset, resource=resource
+ )
+
+ # Load organization data
+ org_csv = PipelineConfig.download_organisation_csv(cache_dir)
+ organisation = p["Organisation"](
+ organisation_path=str(org_csv), pipeline_dir=Path(pipeline_dir)
+ )
+ api = p["API"](specification=specification)
+
+ # Get configuration
+ entity_range_min = specification.get_dataset_entity_min(self.dataset)
+ entity_range_max = specification.get_dataset_entity_max(self.dataset)
+ endpoints = []
+ organisations_list = ["government-organisation:D2"]
+ entry_date = datetime.now().strftime("%Y-%m-%d")
+
+ # Get pipeline configuration
+ skip_patterns = pipeline.skip_patterns(resource, endpoints)
+ columns = pipeline.columns(resource, endpoints=endpoints)
+ concats = pipeline.concatenations(resource, endpoints=endpoints)
+ patches = pipeline.patches(resource=resource, endpoints=endpoints)
+ lookups = pipeline.lookups(resource=resource)
+ default_fields = pipeline.default_fields(resource=resource, endpoints=endpoints)
+ default_values = pipeline.default_values(endpoints=endpoints)
+ combine_fields = pipeline.combine_fields(endpoints=endpoints)
+ redirect_lookups = pipeline.redirect_lookups()
+ migrations = pipeline.migrations()
+ config = None
+ valid_category_values = api.get_valid_category_values(self.dataset, pipeline)
+
+ if len(organisations_list) == 1:
+ default_values["organisation"] = organisations_list[0]
+ if entry_date and "entry-date" not in default_values:
+ default_values["entry-date"] = entry_date
+
+ field_datatype_map = specification.get_field_datatype_map()
+ field_typology_map = specification.get_field_typology_map()
+ field_prefix_map = specification.get_field_prefix_map()
+ dataset_prefix = specification.dataset_prefix(self.dataset)
+
+ print(" Running 26-phase pipeline with per-phase timing...")
+
+ # Define phase creators
+ phase_creators = [
+ (
+ 1,
+ "ConvertPhase",
+ lambda: p["ConvertPhase"](
+ path=str(input_csv),
+ dataset_resource_log=dataset_resource_log,
+ converted_resource_log=converted_resource_log,
+ ),
+ ),
+ (
+ 2,
+ "NormalisePhase",
+ lambda: p["NormalisePhase"](skip_patterns=skip_patterns),
+ ),
+ (3, "ParsePhase", lambda: p["ParsePhase"]()),
+ (
+ 4,
+ "ConcatFieldPhase",
+ lambda: p["ConcatFieldPhase"](concats=concats, log=column_field_log),
+ ),
+ (
+ 5,
+ "FilterPhase-1",
+ lambda: p["FilterPhase"](filters=pipeline.filters(resource)),
+ ),
+ (
+ 6,
+ "MapPhase",
+ lambda: p["MapPhase"](
+ fieldnames=intermediate_fieldnames,
+ columns=columns,
+ log=column_field_log,
+ ),
+ ),
+ (
+ 7,
+ "FilterPhase-2",
+ lambda: p["FilterPhase"](
+ filters=pipeline.filters(resource, endpoints=endpoints)
+ ),
+ ),
+ (
+ 8,
+ "PatchPhase",
+ lambda: p["PatchPhase"](issues=issue_log, patches=patches),
+ ),
+ (
+ 9,
+ "HarmonisePhase",
+ lambda: p["HarmonisePhase"](
+ field_datatype_map=field_datatype_map,
+ issues=issue_log,
+ dataset=self.dataset,
+ valid_category_values=valid_category_values,
+ ),
+ ),
+ (
+ 10,
+ "DefaultPhase",
+ lambda: p["DefaultPhase"](
+ default_fields=default_fields,
+ default_values=default_values,
+ issues=issue_log,
+ ),
+ ),
+ (
+ 11,
+ "MigratePhase",
+ lambda: p["MigratePhase"](
+ fields=specification.schema_field[schema], migrations=migrations
+ ),
+ ),
+ (
+ 12,
+ "OrganisationPhase",
+ lambda: p["OrganisationPhase"](
+ organisation=organisation, issues=issue_log
+ ),
+ ),
+ (
+ 13,
+ "FieldPrunePhase",
+ lambda: p["FieldPrunePhase"](
+ fields=specification.current_fieldnames(schema)
+ ),
+ ),
+ (
+ 14,
+ "EntityReferencePhase",
+ lambda: p["EntityReferencePhase"](
+ dataset=self.dataset, prefix=dataset_prefix, issues=issue_log
+ ),
+ ),
+ (
+ 15,
+ "EntityPrefixPhase",
+ lambda: p["EntityPrefixPhase"](dataset=self.dataset),
+ ),
+ (
+ 16,
+ "EntityLookupPhase",
+ lambda: p["EntityLookupPhase"](
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ operational_issue_log=operational_issue_log,
+ entity_range=[entity_range_min, entity_range_max],
+ ),
+ ),
+ (
+ 17,
+ "SavePhase-harmonised",
+ lambda: p["SavePhase"](
+ str(harmonised_csv),
+ fieldnames=intermediate_fieldnames,
+ enabled=True,
+ ),
+ ),
+ (
+ 18,
+ "EntityPrunePhase",
+ lambda: p["EntityPrunePhase"](
+ dataset_resource_log=dataset_resource_log
+ ),
+ ),
+ (
+ 19,
+ "PriorityPhase",
+ lambda: p["PriorityPhase"](config=config, providers=organisations_list),
+ ),
+ (20, "PivotPhase", lambda: p["PivotPhase"]()),
+ (
+ 21,
+ "FactCombinePhase",
+ lambda: p["FactCombinePhase"](
+ issue_log=issue_log, fields=combine_fields
+ ),
+ ),
+ (22, "FactorPhase", lambda: p["FactorPhase"]()),
+ (
+ 23,
+ "FactReferencePhase",
+ lambda: p["FactReferencePhase"](
+ field_typology_map=field_typology_map,
+ field_prefix_map=field_prefix_map,
+ ),
+ ),
+ (
+ 24,
+ "FactLookupPhase",
+ lambda: p["FactLookupPhase"](
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ odp_collections=specification.get_odp_collections(),
+ ),
+ ),
+ (25, "FactPrunePhase", lambda: p["FactPrunePhase"]()),
+ (
+ 26,
+ "SavePhase-facts",
+ lambda: p["SavePhase"](str(facts_csv), fieldnames=factor_fieldnames),
+ ),
+ ]
+
+ # Run phases with timing
+ stream_data = []
+ total_start = time.time()
+
+ for phase_num, phase_name, phase_creator in phase_creators:
+ phase = phase_creator()
+ phase_start = time.time()
+
+ if phase_num == 1:
+ output_stream = phase.process(iter([]))
+ else:
+ output_stream = phase.process(iter(stream_data))
+
+ stream_data = list(output_stream)
+ duration = time.time() - phase_start
+ output_count = len(stream_data)
+
+ if report:
+ metrics = report.add_phase(phase_name, phase_num)
+ metrics.duration_seconds = duration
+ metrics.output_count = output_count
+
+ if duration > 0.1:
+ print(
+ f" Phase {phase_num:2d}: {phase_name:<25} {duration:8.4f}s ({output_count:,} rows)"
+ )
+
+ total_transform_time = time.time() - total_start
+ print(f" Total transform time: {total_transform_time:.3f}s")
+
+ # Count results
+ harmonised_count = (
+ sum(1 for _ in open(harmonised_csv)) - 1 if harmonised_csv.exists() else 0
+ )
+ facts_count = sum(1 for _ in open(facts_csv)) - 1 if facts_csv.exists() else 0
+ issue_log.save(str(issue_csv))
+
+ if report:
+ report.harmonised_records = harmonised_count
+ report.fact_records = facts_count
+
+ return {
+ "harmonised": harmonised_count,
+ "facts": facts_count,
+ "harmonised_path": str(harmonised_csv),
+ "facts_path": str(facts_csv),
+ "issues_path": str(issue_csv),
+ "transform_time": total_transform_time,
+ }
From ecd0b4f9ab8921a5a8397a2ef6ae1a71a4a1e43d Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 9 Feb 2026 13:52:58 +0000
Subject: [PATCH 12/15] =?UTF-8?q?refactor:=20improve=20code=20formatting?=
=?UTF-8?q?=20and=20readability=20across=20multiple=20files=20Rapid=20loca?=
=?UTF-8?q?l=20performance=20test=20environment=20supporting=20the=20Polar?=
=?UTF-8?q?s=E2=80=91based=20transformation=20rewrite=20in=20digital-land-?=
=?UTF-8?q?python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/file_downloader.py | 112 +++++++----
local_testing/gml_converter.py | 332 ++++++++++++++++++-------------
local_testing/gml_extractor.py | 26 +--
local_testing/main.py | 29 +--
local_testing/pipeline_report.py | 48 +++--
local_testing/run_all.py | 14 +-
6 files changed, 326 insertions(+), 235 deletions(-)
diff --git a/local_testing/file_downloader.py b/local_testing/file_downloader.py
index e13fc40b..d3bcb4ea 100644
--- a/local_testing/file_downloader.py
+++ b/local_testing/file_downloader.py
@@ -12,6 +12,7 @@
try:
import requests
+
HAS_REQUESTS = True
except ImportError:
HAS_REQUESTS = False
@@ -32,23 +33,27 @@ def fetch_endpoint_list(self) -> List[dict]:
req = urllib.request.Request(
self.endpoint_csv_url,
- headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
+ headers={
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
+ },
)
-
+
with urllib.request.urlopen(req) as response:
- content = response.read().decode('utf-8')
+ content = response.read().decode("utf-8")
reader = csv.DictReader(content.splitlines())
-
+
endpoints = []
for row in reader:
- url = row.get('endpoint-url', '').strip()
+ url = row.get("endpoint-url", "").strip()
if url:
- endpoints.append({
- 'endpoint': row.get('endpoint', ''),
- 'url': url,
- 'local_authority': self.get_la_name_from_url(url),
- 'entry_date': row.get('entry-date', ''),
- })
+ endpoints.append(
+ {
+ "endpoint": row.get("endpoint", ""),
+ "url": url,
+ "local_authority": self.get_la_name_from_url(url),
+ "entry_date": row.get("entry-date", ""),
+ }
+ )
print(f" Found {len(endpoints)} endpoints")
return endpoints
@@ -61,16 +66,27 @@ def get_la_name_from_url(url: str) -> str:
if parts:
filename = parts[-1].replace(".zip", "").replace("_", " ")
# Remove common suffixes for cleaner names
- for suffix in [" Council", " Borough Council", " City Council", " District Council",
- " Metropolitan Borough Council", " County Council"]:
+ for suffix in [
+ " Council",
+ " Borough Council",
+ " City Council",
+ " District Council",
+ " Metropolitan Borough Council",
+ " County Council",
+ ]:
if filename.endswith(suffix):
- filename = filename[:-len(suffix)]
+ filename = filename[: -len(suffix)]
break
# Remove prefixes
- for prefix in ["Borough of ", "City of ", "County of ", "Royal Borough of ",
- "London Borough of "]:
+ for prefix in [
+ "Borough of ",
+ "City of ",
+ "County of ",
+ "Royal Borough of ",
+ "London Borough of ",
+ ]:
if filename.startswith(prefix):
- filename = filename[len(prefix):]
+ filename = filename[len(prefix) :]
break
return filename.strip()
return "Unknown"
@@ -99,58 +115,66 @@ def download_file(
return self._download_with_requests(url, output_path, chunk_size)
else:
return self._download_with_urllib(url, output_path, chunk_size)
-
- def _download_with_requests(self, url: str, output_path: Path, chunk_size: int) -> Path:
+
+ def _download_with_requests(
+ self, url: str, output_path: Path, chunk_size: int
+ ) -> Path:
"""Download using requests library (handles redirects better)."""
headers = {
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language': 'en-GB,en;q=0.9',
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "Accept-Language": "en-GB,en;q=0.9",
}
-
+
session = requests.Session()
session.headers.update(headers)
-
+
response = session.get(url, stream=True, allow_redirects=True, timeout=30)
response.raise_for_status()
-
- total_size = int(response.headers.get('content-length', 0))
+
+ total_size = int(response.headers.get("content-length", 0))
downloaded = 0
-
- with open(output_path, 'wb') as f:
+
+ with open(output_path, "wb") as f:
for chunk in response.iter_content(chunk_size=chunk_size):
if chunk:
f.write(chunk)
downloaded += len(chunk)
-
+
if total_size > 0:
progress = (downloaded / total_size) * 100
mb_downloaded = downloaded / (1024 * 1024)
mb_total = total_size / (1024 * 1024)
- print(f"\r Progress: {progress:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end="", flush=True)
-
+ print(
+ f"\r Progress: {progress:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)",
+ end="",
+ flush=True,
+ )
+
print() # New line after progress
+        print(f"  ✓ Downloaded {downloaded:,} bytes")
return output_path
-
- def _download_with_urllib(self, url: str, output_path: Path, chunk_size: int) -> Path:
+
+ def _download_with_urllib(
+ self, url: str, output_path: Path, chunk_size: int
+ ) -> Path:
"""Download using urllib (fallback)."""
# Add comprehensive browser headers to mimic real browser
headers = {
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
- 'Accept-Language': 'en-GB,en;q=0.9',
- 'Accept-Encoding': 'gzip, deflate, br',
- 'Connection': 'keep-alive',
- 'Upgrade-Insecure-Requests': '1',
- 'Sec-Fetch-Dest': 'document',
- 'Sec-Fetch-Mode': 'navigate',
- 'Sec-Fetch-Site': 'none',
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+ "Accept-Language": "en-GB,en;q=0.9",
+ "Accept-Encoding": "gzip, deflate, br",
+ "Connection": "keep-alive",
+ "Upgrade-Insecure-Requests": "1",
+ "Sec-Fetch-Dest": "document",
+ "Sec-Fetch-Mode": "navigate",
+ "Sec-Fetch-Site": "none",
}
-
+
req = urllib.request.Request(url, headers=headers)
-
+
with urllib.request.urlopen(req) as response:
total_size = int(response.headers.get("content-length", 0))
downloaded = 0
diff --git a/local_testing/gml_converter.py b/local_testing/gml_converter.py
index 4a3b4036..1ece02f1 100644
--- a/local_testing/gml_converter.py
+++ b/local_testing/gml_converter.py
@@ -15,44 +15,48 @@
class GMLConverter:
"""Converts GML files to CSV/Parquet with multiple strategies."""
-
+
@staticmethod
def extract_polygon_wkt(geometry_text: str) -> str:
"""
Extract polygon coordinates and convert to WKT format.
-
+
Handles both exterior rings and interior rings (holes).
-
+
Args:
geometry_text: GML geometry element text
-
+
Returns:
WKT polygon string, or empty string if no valid geometry
"""
exterior_match = re.search(
- r'.*?([^<]+).*?',
- geometry_text, re.DOTALL
+ r".*?([^<]+).*?",
+ geometry_text,
+ re.DOTALL,
)
-
+
if not exterior_match:
return ""
-
+
exterior_coords_raw = exterior_match.group(1).strip().split()
exterior_coords = []
for i in range(0, len(exterior_coords_raw), 2):
if i + 1 < len(exterior_coords_raw):
- exterior_coords.append(f"{exterior_coords_raw[i]} {exterior_coords_raw[i+1]}")
-
+ exterior_coords.append(
+ f"{exterior_coords_raw[i]} {exterior_coords_raw[i+1]}"
+ )
+
if not exterior_coords:
return ""
-
+
# Extract interior rings (holes)
interior_rings = []
interior_matches = re.findall(
- r'.*?([^<]+).*?',
- geometry_text, re.DOTALL
+ r".*?([^<]+).*?",
+ geometry_text,
+ re.DOTALL,
)
-
+
for interior_coords_raw in interior_matches:
coords = interior_coords_raw.strip().split()
ring_coords = []
@@ -61,127 +65,149 @@ def extract_polygon_wkt(geometry_text: str) -> str:
ring_coords.append(f"{coords[i]} {coords[i+1]}")
if ring_coords:
interior_rings.append(ring_coords)
-
+
exterior_wkt = f"({', '.join(exterior_coords)})"
if interior_rings:
interior_wkts = [f"({', '.join(ring)})" for ring in interior_rings]
return f"POLYGON({exterior_wkt}, {', '.join(interior_wkts)})"
return f"POLYGON({exterior_wkt})"
-
+
@staticmethod
def extract_field(text: str, field_name: str) -> str:
"""
Extract a field value from GML text.
-
+
Args:
text: GML text to search
field_name: Field name to extract
-
+
Returns:
Field value, or empty string if not found
"""
-        pattern = f'<{field_name}>([^<]+)</{field_name}>'
+        pattern = f"<{field_name}>([^<]+)</{field_name}>"
match = re.search(pattern, text)
return match.group(1) if match else ""
-
- def convert_to_csv(self, gml_path: Path, csv_path: Path, limit: Optional[int] = None) -> int:
+
+ def convert_to_csv(
+ self, gml_path: Path, csv_path: Path, limit: Optional[int] = None
+ ) -> int:
"""
Convert GML file to CSV format using regex parsing.
-
+
This is the baseline method - slower but doesn't require external dependencies.
-
+
Args:
gml_path: Path to input GML file
csv_path: Path to output CSV file
limit: Optional limit on number of records to convert
-
+
Returns:
Number of records converted
"""
print(f" Converting GML to CSV...")
print(f" Input: {gml_path}")
print(f" Output: {csv_path}")
-
+
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" GML size: {size_mb:.1f} MB")
-
- with open(gml_path, 'r', encoding='utf-8') as f:
+
+ with open(gml_path, "r", encoding="utf-8") as f:
content = f.read()
-
+
# Find all cadastral parcel elements
-        pattern = r'<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>'
+        pattern = r"<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>"
matches = re.findall(pattern, content, re.DOTALL)
total_features = len(matches)
print(f" Found {total_features} cadastral parcels")
-
+
if limit:
print(f" Limiting to {limit} records")
-
+
fieldnames = [
- 'reference', 'name', 'national-cadastral-reference', 'geometry',
- 'start-date', 'entry-date', 'end-date', 'prefix', 'organisation', 'notes'
+ "reference",
+ "name",
+ "national-cadastral-reference",
+ "geometry",
+ "start-date",
+ "entry-date",
+ "end-date",
+ "prefix",
+ "organisation",
+ "notes",
]
-
+
csv_path.parent.mkdir(parents=True, exist_ok=True)
count = 0
-
- with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
+
+ with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
+ writer = csv.DictWriter(
+ csvfile, fieldnames=fieldnames, extrasaction="ignore"
+ )
writer.writeheader()
-
+
for match in matches:
feature = {}
-
- inspire_id = self.extract_field(match, 'INSPIREID')
+
+ inspire_id = self.extract_field(match, "INSPIREID")
if inspire_id:
- feature['reference'] = inspire_id
- feature['name'] = inspire_id
-
- ncr = self.extract_field(match, 'NATIONALCADASTRALREFERENCE')
+ feature["reference"] = inspire_id
+ feature["name"] = inspire_id
+
+ ncr = self.extract_field(match, "NATIONALCADASTRALREFERENCE")
if ncr:
- feature['national-cadastral-reference'] = ncr
-
- valid_from = self.extract_field(match, 'VALIDFROM')
+ feature["national-cadastral-reference"] = ncr
+
+ valid_from = self.extract_field(match, "VALIDFROM")
if valid_from:
- feature['start-date'] = valid_from.split('T')[0] if 'T' in valid_from else valid_from
-
- begin_lifespan = self.extract_field(match, 'BEGINLIFESPANVERSION')
+ feature["start-date"] = (
+ valid_from.split("T")[0] if "T" in valid_from else valid_from
+ )
+
+ begin_lifespan = self.extract_field(match, "BEGINLIFESPANVERSION")
if begin_lifespan:
- feature['entry-date'] = begin_lifespan.split('T')[0] if 'T' in begin_lifespan else begin_lifespan
-
-            geometry_match = re.search(r'<cp:geometry>(.*?)</cp:geometry>', match, re.DOTALL)
+ feature["entry-date"] = (
+ begin_lifespan.split("T")[0]
+ if "T" in begin_lifespan
+ else begin_lifespan
+ )
+
+ geometry_match = re.search(
+                r"<cp:geometry>(.*?)</cp:geometry>", match, re.DOTALL
+ )
if geometry_match:
wkt = self.extract_polygon_wkt(geometry_match.group(1))
if wkt:
- feature['geometry'] = wkt
-
- if 'reference' in feature:
- feature['prefix'] = 'title-boundary'
- feature['organisation'] = 'government-organisation:D2'
+ feature["geometry"] = wkt
+
+ if "reference" in feature:
+ feature["prefix"] = "title-boundary"
+ feature["organisation"] = "government-organisation:D2"
writer.writerow(feature)
count += 1
-
+
if count % 5000 == 0:
print(f" Converted {count}/{total_features} features...")
-
+
if limit and count >= limit:
break
-
+
print(f" Converted {count} records to CSV")
return count
-
- def convert_to_parquet(self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None) -> int:
+
+ def convert_to_parquet(
+ self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None
+ ) -> int:
"""
Convert GML file to Parquet format using regex parsing + Polars.
-
+
Parquet is faster to read than CSV and preserves data types.
Falls back to CSV if Polars is not installed.
-
+
Args:
gml_path: Path to input GML file
parquet_path: Path to output Parquet file
limit: Optional limit on number of records to convert
-
+
Returns:
Number of records converted
"""
@@ -190,86 +216,96 @@ def convert_to_parquet(self, gml_path: Path, parquet_path: Path, limit: Optional
except ImportError:
print(" Polars not installed. Install with: pip install polars")
print(" Falling back to CSV...")
- csv_path = parquet_path.with_suffix('.csv')
+ csv_path = parquet_path.with_suffix(".csv")
return self.convert_to_csv(gml_path, csv_path, limit)
-
+
print(f" Converting GML to Parquet...")
print(f" Input: {gml_path}")
print(f" Output: {parquet_path}")
-
+
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" GML size: {size_mb:.1f} MB")
-
- with open(gml_path, 'r', encoding='utf-8') as f:
+
+ with open(gml_path, "r", encoding="utf-8") as f:
content = f.read()
-
+
# Find all cadastral parcel elements
-        pattern = r'<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>'
+        pattern = r"<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>"
matches = re.findall(pattern, content, re.DOTALL)
total_features = len(matches)
print(f" Found {total_features} cadastral parcels")
-
+
if limit:
print(f" Limiting to {limit} records")
matches = matches[:limit]
-
+
# Build list of records
records = []
for match in matches:
feature = {}
-
- inspire_id = self.extract_field(match, 'INSPIREID')
+
+ inspire_id = self.extract_field(match, "INSPIREID")
if inspire_id:
- feature['reference'] = inspire_id
- feature['name'] = inspire_id
-
- ncr = self.extract_field(match, 'NATIONALCADASTRALREFERENCE')
+ feature["reference"] = inspire_id
+ feature["name"] = inspire_id
+
+ ncr = self.extract_field(match, "NATIONALCADASTRALREFERENCE")
if ncr:
- feature['national-cadastral-reference'] = ncr
-
- valid_from = self.extract_field(match, 'VALIDFROM')
+ feature["national-cadastral-reference"] = ncr
+
+ valid_from = self.extract_field(match, "VALIDFROM")
if valid_from:
- feature['start-date'] = valid_from.split('T')[0] if 'T' in valid_from else valid_from
-
- begin_lifespan = self.extract_field(match, 'BEGINLIFESPANVERSION')
+ feature["start-date"] = (
+ valid_from.split("T")[0] if "T" in valid_from else valid_from
+ )
+
+ begin_lifespan = self.extract_field(match, "BEGINLIFESPANVERSION")
if begin_lifespan:
- feature['entry-date'] = begin_lifespan.split('T')[0] if 'T' in begin_lifespan else begin_lifespan
-
-            geometry_match = re.search(r'<cp:geometry>(.*?)</cp:geometry>', match, re.DOTALL)
+ feature["entry-date"] = (
+ begin_lifespan.split("T")[0]
+ if "T" in begin_lifespan
+ else begin_lifespan
+ )
+
+ geometry_match = re.search(
+                r"<cp:geometry>(.*?)</cp:geometry>", match, re.DOTALL
+ )
if geometry_match:
wkt = self.extract_polygon_wkt(geometry_match.group(1))
if wkt:
- feature['geometry'] = wkt
-
- if 'reference' in feature:
- feature['prefix'] = 'title-boundary'
- feature['organisation'] = 'government-organisation:D2'
- feature['end-date'] = None
- feature['notes'] = None
+ feature["geometry"] = wkt
+
+ if "reference" in feature:
+ feature["prefix"] = "title-boundary"
+ feature["organisation"] = "government-organisation:D2"
+ feature["end-date"] = None
+ feature["notes"] = None
records.append(feature)
-
+
# Create DataFrame and write to Parquet
parquet_path.parent.mkdir(parents=True, exist_ok=True)
-
+
df = pl.DataFrame(records)
- df.write_parquet(parquet_path, compression='snappy')
-
+ df.write_parquet(parquet_path, compression="snappy")
+
count = len(records)
print(f" Converted {count} records to Parquet")
return count
-
- def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None) -> int:
+
+ def convert_to_parquet_duckdb(
+ self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None
+ ) -> int:
"""
Convert GML file to Parquet format using DuckDB with spatial extension.
-
+
This is the fastest method - DuckDB reads GML directly and writes Parquet.
Falls back to Polars-based converter if DuckDB is not available.
-
+
Args:
gml_path: Path to input GML file
parquet_path: Path to output Parquet file
limit: Optional limit on number of records to convert
-
+
Returns:
Number of records converted
"""
@@ -279,16 +315,16 @@ def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: O
print(" DuckDB not installed. Install with: pip install duckdb")
print(" Falling back to Polars-based converter...")
return self.convert_to_parquet(gml_path, parquet_path, limit)
-
+
print(f" Converting GML to Parquet using DuckDB...")
print(f" Input: {gml_path}")
print(f" Output: {parquet_path}")
-
+
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" GML size: {size_mb:.1f} MB")
-
+
parquet_path.parent.mkdir(parents=True, exist_ok=True)
-
+
try:
con = duckdb.connect()
try:
@@ -299,10 +335,10 @@ def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: O
print(" Falling back to Polars-based converter...")
con.close()
return self.convert_to_parquet(gml_path, parquet_path, limit)
-
+
print(" Reading GML file...")
limit_clause = f"LIMIT {limit}" if limit else ""
-
+
query = f"""
SELECT
INSPIREID as reference,
@@ -327,48 +363,54 @@ def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: O
WHERE INSPIREID IS NOT NULL
{limit_clause}
"""
-
+
count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')"
total_count = con.execute(count_query).fetchone()[0]
print(f" Found {total_count:,} cadastral parcels")
-
+
if limit:
print(f" Limiting to {limit} records")
-
+
# Export directly to Parquet (much faster than CSV)
print(" Transforming and writing to Parquet...")
- con.execute(f"COPY ({query}) TO '{parquet_path}' (FORMAT PARQUET, COMPRESSION 'snappy')")
-
+ con.execute(
+ f"COPY ({query}) TO '{parquet_path}' (FORMAT PARQUET, COMPRESSION 'snappy')"
+ )
+
# Count output rows
- result_count = con.execute(f"SELECT COUNT(*) FROM read_parquet('{parquet_path}')").fetchone()[0]
-
+ result_count = con.execute(
+ f"SELECT COUNT(*) FROM read_parquet('{parquet_path}')"
+ ).fetchone()[0]
+
con.close()
-
+
print(f" Converted {result_count:,} records to Parquet")
return result_count
-
+
except Exception as e:
print(f" DuckDB conversion failed: {e}")
print(" Falling back to Polars-based converter...")
return self.convert_to_parquet(gml_path, parquet_path, limit)
-
- def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[int] = None) -> int:
+
+ def convert_to_csv_duckdb(
+ self, gml_path: Path, csv_path: Path, limit: Optional[int] = None
+ ) -> int:
"""
Convert GML file to CSV format using DuckDB with spatial extension.
-
+
This is significantly faster than regex parsing and properly handles:
- Coordinate transformations (OSGB EPSG:27700 to WGS84 EPSG:4326)
- Complex geometries (multi-polygons, holes)
- Large files with streaming
-
+
Note: For even better performance, use convert_to_parquet_duckdb() instead.
Falls back to regex-based converter if DuckDB is not available.
-
+
Args:
gml_path: Path to input GML file
csv_path: Path to output CSV file
limit: Optional limit on number of records to convert
-
+
Returns:
Number of records converted
"""
@@ -378,16 +420,16 @@ def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[
print(" DuckDB not installed. Install with: pip install duckdb")
print(" Falling back to regex-based converter...")
return self.convert_to_csv(gml_path, csv_path, limit)
-
+
print(f" Converting GML to CSV using DuckDB...")
print(f" Input: {gml_path}")
print(f" Output: {csv_path}")
-
+
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" GML size: {size_mb:.1f} MB")
-
+
csv_path.parent.mkdir(parents=True, exist_ok=True)
-
+
try:
# Create DuckDB connection and load spatial extension
con = duckdb.connect()
@@ -397,16 +439,18 @@ def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[
except Exception as ext_err:
print(f" Failed to load spatial extension: {ext_err}")
print(" This may be a network issue. Try running:")
- print(" python -c \"import duckdb; duckdb.connect().execute('INSTALL spatial')\"")
+ print(
+ " python -c \"import duckdb; duckdb.connect().execute('INSTALL spatial')\""
+ )
print(" Falling back to regex-based converter...")
con.close()
return self.convert_to_csv(gml_path, csv_path, limit)
-
+
# Read GML file using ST_Read (GDAL-based)
print(" Reading GML file...")
-
+
limit_clause = f"LIMIT {limit}" if limit else ""
-
+
query = f"""
SELECT
INSPIREID as reference,
@@ -431,27 +475,29 @@ def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[
WHERE INSPIREID IS NOT NULL
{limit_clause}
"""
-
+
# Execute and get count first
count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')"
total_count = con.execute(count_query).fetchone()[0]
print(f" Found {total_count:,} cadastral parcels")
-
+
if limit:
print(f" Limiting to {limit} records")
-
+
# Export directly to CSV
print(" Transforming and writing to CSV...")
con.execute(f"COPY ({query}) TO '{csv_path}' (HEADER, DELIMITER ',')")
-
+
# Count output rows
- result_count = con.execute(f"SELECT COUNT(*) FROM read_csv('{csv_path}')").fetchone()[0]
-
+ result_count = con.execute(
+ f"SELECT COUNT(*) FROM read_csv('{csv_path}')"
+ ).fetchone()[0]
+
con.close()
-
+
print(f" Converted {result_count:,} records to CSV")
return result_count
-
+
except Exception as e:
print(f" DuckDB conversion failed: {e}")
print(" Falling back to regex-based converter...")
diff --git a/local_testing/gml_extractor.py b/local_testing/gml_extractor.py
index fb004301..767ed8b1 100644
--- a/local_testing/gml_extractor.py
+++ b/local_testing/gml_extractor.py
@@ -10,41 +10,41 @@
class GMLExtractor:
"""Extracts GML files from ZIP archives."""
-
+
@staticmethod
def extract_gml_from_zip(zip_path: Path, output_dir: Path) -> Path:
"""
Extract GML file from ZIP archive.
-
+
Args:
zip_path: Path to ZIP file
output_dir: Directory to extract GML file to
-
+
Returns:
Path to extracted GML file
-
+
Raises:
ValueError: If no GML file found in archive
"""
output_dir.mkdir(parents=True, exist_ok=True)
-
+
print(f" Extracting GML from {zip_path}")
-
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
# Find GML file in archive
- gml_files = [f for f in zip_ref.namelist() if f.lower().endswith('.gml')]
-
+ gml_files = [f for f in zip_ref.namelist() if f.lower().endswith(".gml")]
+
if not gml_files:
raise ValueError(f"No GML file found in {zip_path}")
-
+
gml_filename = gml_files[0]
print(f" Found: {gml_filename}")
-
+
# Extract to output directory
zip_ref.extract(gml_filename, output_dir)
-
+
gml_path = output_dir / gml_filename
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" Extracted: {gml_path} ({size_mb:.1f} MB)")
-
+
return gml_path
diff --git a/local_testing/main.py b/local_testing/main.py
index 34b5c586..151aa0dc 100644
--- a/local_testing/main.py
+++ b/local_testing/main.py
@@ -35,10 +35,10 @@
def parse_phase_selection(phases_str: str) -> set:
"""
Parse phase selection string into set of phase numbers.
-
+
Args:
phases_str: Comma-separated phase numbers or ranges (e.g., "1,2,9" or "1-5,9")
-
+
Returns:
Set of selected phase numbers, or None if invalid
"""
@@ -53,11 +53,11 @@ def parse_phase_selection(phases_str: str) -> set:
else:
# Single phase: "9"
phases.add(int(part))
-
+
# Validate phase numbers (1-26)
if any(p < 1 or p > 26 for p in phases):
return None
-
+
return phases
except (ValueError, AttributeError):
return None
@@ -261,20 +261,20 @@ def main():
fact_records=results["facts"],
transform_time=results.get("transform_time", 0),
)
-
+
# Run Polars pipeline for comparison if requested
if args.compare:
print("\n Running Polars pipeline for comparison...")
from polars_phases import run_polars_pipeline, PolarsPhaseMetrics
-
+
# Define required parameters
field_datatype_map = {"geometry": "text"} # Simplified for now
intermediate_fieldnames = ["entity", "name", "geometry", "organisation"]
factor_fieldnames = ["entity", "fact"]
-
+
polars_harmonised = output_dir / f"{la_name}_polars_harmonised.csv"
polars_facts = output_dir / f"{la_name}_polars_facts.csv"
-
+
polars_start = time.time()
polars_metrics, polars_harm_count, polars_fact_count = run_polars_pipeline(
input_csv=output_path,
@@ -287,11 +287,12 @@ def main():
selected_phases=selected_phases, # Pass phase selection to Polars
)
polars_end = time.time()
-
+
# Store Polars metrics in report
report.polars_phases = []
for metric in polars_metrics:
from pipeline_report import PhaseMetrics
+
phase_metric = PhaseMetrics(
name=metric.name,
phase_number=metric.phase_number,
@@ -302,12 +303,16 @@ def main():
output_count=metric.output_count,
)
report.polars_phases.append(phase_metric)
-
+
report.polars_harmonised_records = polars_harm_count
report.polars_fact_records = polars_fact_count
report.polars_transform_seconds = polars_end - polars_start
-
- speedup = results.get("transform_time", 0) / report.polars_transform_seconds if report.polars_transform_seconds > 0 else 0
+
+ speedup = (
+ results.get("transform_time", 0) / report.polars_transform_seconds
+ if report.polars_transform_seconds > 0
+ else 0
+ )
print(f" Polars transform time: {report.polars_transform_seconds:.3f}s")
print(f" Speedup: {speedup:.1f}x faster")
diff --git a/local_testing/pipeline_report.py b/local_testing/pipeline_report.py
index cb4b9c26..189f1447 100644
--- a/local_testing/pipeline_report.py
+++ b/local_testing/pipeline_report.py
@@ -141,16 +141,22 @@ def to_dict(self) -> Dict:
phases_to_output = self.phases
polars_phases_to_output = self.polars_phases
if self.selected_phases:
- phases_to_output = [p for p in self.phases if p.phase_number in self.selected_phases]
- polars_phases_to_output = [p for p in self.polars_phases if p.phase_number in self.selected_phases]
-
+ phases_to_output = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
+ polars_phases_to_output = [
+ p for p in self.polars_phases if p.phase_number in self.selected_phases
+ ]
+
return {
"run_id": self.run_id,
"timestamp": self.timestamp,
"local_authority": self.local_authority,
"dataset": self.dataset,
"record_limit": self.record_limit,
- "selected_phases": list(sorted(self.selected_phases)) if self.selected_phases else None,
+ "selected_phases": (
+ list(sorted(self.selected_phases)) if self.selected_phases else None
+ ),
"input_records": self.input_records,
"harmonised_records": self.harmonised_records,
"fact_records": self.fact_records,
@@ -269,7 +275,7 @@ def generate_text_report(self) -> str:
lines.append("=" * 100)
lines.append("PHASE-BY-PHASE COMPARISON: ORIGINAL vs POLARS")
lines.append("=" * 100)
-
+
# Show phase selection info if applicable
if self.selected_phases:
lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
@@ -287,7 +293,9 @@ def generate_text_report(self) -> str:
# Filter phases if selection is active
phases_to_display = self.phases
if self.selected_phases:
- phases_to_display = [p for p in self.phases if p.phase_number in self.selected_phases]
+ phases_to_display = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
total_original = 0.0
total_polars = 0.0
@@ -339,12 +347,12 @@ def generate_text_report(self) -> str:
lines.append("-" * 100)
lines.append("ORIGINAL PIPELINE - PHASE TIMING (Row-by-Row)")
lines.append("-" * 100)
-
+
# Show phase selection info if applicable
if self.selected_phases:
lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
lines.append("")
-
+
lines.append(
f"{'#':<4} {'Phase Name':<30} {'Duration':>12} {'% of Transform':>14} {'Output':>10}"
)
@@ -353,7 +361,9 @@ def generate_text_report(self) -> str:
# Filter phases if selection is active
phases_to_display = self.phases
if self.selected_phases:
- phases_to_display = [p for p in self.phases if p.phase_number in self.selected_phases]
+ phases_to_display = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
for phase in phases_to_display:
pct = (
@@ -375,12 +385,14 @@ def generate_text_report(self) -> str:
lines.append("-" * 100)
lines.append("TOP 5 SLOWEST PHASES (Original Pipeline)")
lines.append("-" * 100)
-
+
# Filter phases for "top slowest" if selection is active
phases_for_top5 = self.phases
if self.selected_phases:
- phases_for_top5 = [p for p in self.phases if p.phase_number in self.selected_phases]
-
+ phases_for_top5 = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
+
sorted_phases = sorted(
phases_for_top5, key=lambda x: x.duration_seconds, reverse=True
)[:5]
@@ -404,7 +416,9 @@ def generate_text_report(self) -> str:
# Filter phases for speedup calculation if selection is active
phases_for_speedup = self.phases
if self.selected_phases:
- phases_for_speedup = [p for p in self.phases if p.phase_number in self.selected_phases]
+ phases_for_speedup = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
polars_by_name = {p.name: p for p in self.polars_phases}
speedups = []
@@ -465,13 +479,13 @@ def generate_text_report(self) -> str:
def save_json(self, path: Path):
"""Save report as JSON file."""
import json
-
+
path.parent.mkdir(parents=True, exist_ok=True)
- with open(path, 'w') as f:
+ with open(path, "w") as f:
json.dump(self.to_dict(), f, indent=2)
-
+
def save_text(self, path: Path):
"""Save report as text file."""
path.parent.mkdir(parents=True, exist_ok=True)
- with open(path, 'w') as f:
+ with open(path, "w") as f:
f.write(self.generate_text_report())
diff --git a/local_testing/run_all.py b/local_testing/run_all.py
index 19856b00..55bd3984 100755
--- a/local_testing/run_all.py
+++ b/local_testing/run_all.py
@@ -59,9 +59,11 @@ def main():
# Calculate batch metrics
batch_duration = time.time() - batch_start
- avg_duration = sum(t["duration"] for t in la_times) / len(la_times) if la_times else 0
+ avg_duration = (
+ sum(t["duration"] for t in la_times) / len(la_times) if la_times else 0
+ )
successful_times = [t["duration"] for t in la_times if t["status"] == "success"]
-
+
# Summary
print(f"\n{'='*60}")
print("BATCH PROCESSING COMPLETE (with Polars Comparison)")
@@ -85,7 +87,7 @@ def main():
reports_dir = Path(__file__).parent / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
+
batch_report = {
"batch_timestamp": timestamp,
"total_las": len(endpoints),
@@ -96,13 +98,13 @@ def main():
"polars_comparison_enabled": True,
"limit": limit,
"la_results": la_times,
- "errors": errors
+ "errors": errors,
}
-
+
batch_json = reports_dir / f"batch_{timestamp}_summary.json"
with open(batch_json, "w") as f:
json.dump(batch_report, f, indent=2)
-
+
print(f"\nBatch report saved: {batch_json}")
print(f"{'='*60}\n")
From f6aca1f4964704175f84b980e04dbee13e128dc0 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 9 Feb 2026 14:37:22 +0000
Subject: [PATCH 13/15] =?UTF-8?q?style:=20flake8=20improve=20code=20format?=
=?UTF-8?q?ting=20and=20readability=20in=20GML=20converter,=20main,=20pipe?=
=?UTF-8?q?line=20report,=20pipeline=20runner,=20and=20run=5Fall=20scripts?=
=?UTF-8?q?=20Rapid=20local=20performance=20test=20environment=20supportin?=
=?UTF-8?q?g=20the=20Polars=E2=80=91based=20transformation=20rewrite=20in?=
=?UTF-8?q?=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/gml_converter.py | 28 ++++++++++++++--------------
local_testing/main.py | 4 ++--
local_testing/pipeline_report.py | 6 ++++--
local_testing/pipeline_runner.py | 4 ++--
local_testing/run_all.py | 6 +++---
5 files changed, 25 insertions(+), 23 deletions(-)
diff --git a/local_testing/gml_converter.py b/local_testing/gml_converter.py
index 1ece02f1..c35fbba7 100644
--- a/local_testing/gml_converter.py
+++ b/local_testing/gml_converter.py
@@ -340,20 +340,20 @@ def convert_to_parquet_duckdb(
limit_clause = f"LIMIT {limit}" if limit else ""
query = f"""
- SELECT
+ SELECT
INSPIREID as reference,
INSPIREID as name,
NATIONALCADASTRALREFERENCE as "national-cadastral-reference",
ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry,
- CASE
- WHEN VALIDFROM IS NOT NULL
+ CASE
+ WHEN VALIDFROM IS NOT NULL
THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d')
- ELSE NULL
+ ELSE NULL
END as "start-date",
- CASE
- WHEN BEGINLIFESPANVERSION IS NOT NULL
+ CASE
+ WHEN BEGINLIFESPANVERSION IS NOT NULL
THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d')
- ELSE NULL
+ ELSE NULL
END as "entry-date",
NULL as "end-date",
'title-boundary' as prefix,
@@ -452,20 +452,20 @@ def convert_to_csv_duckdb(
limit_clause = f"LIMIT {limit}" if limit else ""
query = f"""
- SELECT
+ SELECT
INSPIREID as reference,
INSPIREID as name,
NATIONALCADASTRALREFERENCE as "national-cadastral-reference",
ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry,
- CASE
- WHEN VALIDFROM IS NOT NULL
+ CASE
+ WHEN VALIDFROM IS NOT NULL
THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d')
- ELSE NULL
+ ELSE NULL
END as "start-date",
- CASE
- WHEN BEGINLIFESPANVERSION IS NOT NULL
+ CASE
+ WHEN BEGINLIFESPANVERSION IS NOT NULL
THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d')
- ELSE NULL
+ ELSE NULL
END as "entry-date",
NULL as "end-date",
'title-boundary' as prefix,
diff --git a/local_testing/main.py b/local_testing/main.py
index 151aa0dc..8c99c20e 100644
--- a/local_testing/main.py
+++ b/local_testing/main.py
@@ -265,7 +265,7 @@ def main():
# Run Polars pipeline for comparison if requested
if args.compare:
print("\n Running Polars pipeline for comparison...")
- from polars_phases import run_polars_pipeline, PolarsPhaseMetrics
+ from polars_phases import run_polars_pipeline
# Define required parameters
field_datatype_map = {"geometry": "text"} # Simplified for now
@@ -350,7 +350,7 @@ def main():
print(f"Fact Records: {report.fact_records:,}")
if report.steps:
- print(f"\nStep Summary:")
+ print("\nStep Summary:")
for name, step in report.steps.items():
            status = "✓" if step.success else "✗"
print(f" {status} {name:<20} {step.duration_seconds:8.3f}s")
diff --git a/local_testing/pipeline_report.py b/local_testing/pipeline_report.py
index 189f1447..18f26eb8 100644
--- a/local_testing/pipeline_report.py
+++ b/local_testing/pipeline_report.py
@@ -282,9 +282,11 @@ def generate_text_report(self) -> str:
lines.append("")
# Header
- lines.append(
- f"{'#':<3} {'Phase Name':<26} {'Original':>11} {'Polars':>11} {'Speedup':>10} {'Time Saved':>12} {'Orig Out':>10} {'Polars Out':>10}"
+ header = (
+ f"{'#':<3} {'Phase Name':<26} {'Original':>11} {'Polars':>11} "
+ f"{'Speedup':>10} {'Time Saved':>12} {'Orig Out':>10} {'Polars Out':>10}"
)
+ lines.append(header)
lines.append("-" * 100)
# Build lookup for Polars phases by name
diff --git a/local_testing/pipeline_runner.py b/local_testing/pipeline_runner.py
index 3f69ccfd..246737b2 100644
--- a/local_testing/pipeline_runner.py
+++ b/local_testing/pipeline_runner.py
@@ -8,7 +8,7 @@
import time
from pathlib import Path
from datetime import datetime
-from typing import Dict, Optional
+from typing import Dict
from pipeline_config import PipelineConfig
@@ -148,7 +148,7 @@ def run_full_pipeline(
csv_input = input_csv.with_suffix(".csv")
if not csv_input.exists():
- print(f" Converting Parquet to CSV for original pipeline...")
+ print(" Converting Parquet to CSV for original pipeline...")
pl.read_parquet(input_csv).write_csv(csv_input)
input_csv = csv_input
diff --git a/local_testing/run_all.py b/local_testing/run_all.py
index 55bd3984..f58fb13d 100755
--- a/local_testing/run_all.py
+++ b/local_testing/run_all.py
@@ -23,7 +23,7 @@ def main():
print("Fetching endpoint list...")
endpoints = CLI.fetch_endpoint_list()
print(f"Found {len(endpoints)} Local Authorities")
- print(f"Running with Polars comparison enabled\n")
+ print("Running with Polars comparison enabled\n")
success_count = 0
error_count = 0
@@ -76,10 +76,10 @@ def main():
if successful_times:
print(f" Min Time: {min(successful_times):.1f}s")
print(f" Max Time: {max(successful_times):.1f}s")
- print(f"\n Note: All LAs processed with both Original + Polars pipelines")
+ print("\n Note: All LAs processed with both Original + Polars pipelines")
if errors:
- print(f"\nFailed Local Authorities:")
+ print("\nFailed Local Authorities:")
for la in errors:
print(f" - {la}")
From bf2fe7bb8026949698b287df00239e06ae829e97 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 9 Feb 2026 14:51:02 +0000
Subject: [PATCH 14/15] =?UTF-8?q?fix:=20improve=20pipeline=20report=20form?=
=?UTF-8?q?atting=20and=20update=20flake8=20ignore=20rules=20for=20consist?=
=?UTF-8?q?ency=20Rapid=20local=20performance=20test=20environment=20suppo?=
=?UTF-8?q?rting=20the=20Polars=E2=80=91based=20transformation=20rewrite?=
=?UTF-8?q?=20in=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/pipeline_report.py | 8 ++++++--
setup.cfg | 2 +-
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/local_testing/pipeline_report.py b/local_testing/pipeline_report.py
index 18f26eb8..847878a4 100644
--- a/local_testing/pipeline_report.py
+++ b/local_testing/pipeline_report.py
@@ -314,9 +314,13 @@ def generate_text_report(self) -> str:
saved = phase.duration_seconds - polars_phase.duration_seconds
                speedup_str = f"{speedup:.1f}x" if speedup != float("inf") else "∞"
- lines.append(
- f"{phase.phase_number:<3} {phase.name:<26} {phase.duration_seconds:>9.4f}s {polars_phase.duration_seconds:>9.4f}s {speedup_str:>9} {saved:>10.4f}s {phase.output_count:>10,} {polars_phase.output_count:>10,}"
+ phase_line = (
+ f"{phase.phase_number:<3} {phase.name:<26} "
+ f"{phase.duration_seconds:>9.4f}s {polars_phase.duration_seconds:>9.4f}s "
+ f"{speedup_str:>9} {saved:>10.4f}s {phase.output_count:>10,} "
+ f"{polars_phase.output_count:>10,}"
)
+ lines.append(phase_line)
total_original += phase.duration_seconds
total_polars += polars_phase.duration_seconds
diff --git a/setup.cfg b/setup.cfg
index 80f6adc2..2de2c1af 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
[flake8]
max-line-length = 180
-ignore = E203, W503
+ignore = E203, W503, F541, W291
exclude = .venv,.git,__pycache__,docs/source/conf.py,old,build,dist,.direnv
[pycodestyle]
From 83a5c93eb227f11d33016cedf1d9bea60d5b631d Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Tue, 17 Feb 2026 00:57:14 +0000
Subject: [PATCH 15/15] Add Polars phases for data processing pipeline
- Implemented MapPhase for renaming columns based on a mapping specification.
- Created MigratePhase to rename fields according to the latest specification.
- Added NormalisePhase to clean whitespace and handle null patterns in CSV data.
- Developed OrganisationPhase for looking up organisation values.
- Introduced PatchPhase to apply regex patches to field values.
- Implemented PivotPhase to unpivot entity rows into a series of facts.
- Created EntityPrefixPhase to ensure every entry has a prefix field.
- Added PriorityPhase to deduce the priority of each entry.
- Developed FieldPrunePhase and EntityPrunePhase to reduce columns and remove entries with missing entities.
- Implemented EntityReferencePhase and FactReferencePhase to ensure prefix and reference fields are set correctly.
- Created SavePhase to save the DataFrame to a CSV file.
- Added comprehensive tests for each phase to ensure functionality and correctness. #475
---
digital_land/commands.py | 270 +++++++++-----
digital_land/phase_polars/README.md | 41 ---
digital_land/phase_polars/__init__.py | 87 +++++
digital_land/phase_polars/combine.py | 87 +++++
digital_land/phase_polars/concat.py | 90 +++++
digital_land/phase_polars/convert.py | 226 ++++++++++++
digital_land/phase_polars/default.py | 66 ++++
digital_land/phase_polars/dump.py | 29 ++
digital_land/phase_polars/factor.py | 40 +++
digital_land/phase_polars/filter.py | 32 ++
digital_land/phase_polars/harmonise.py | 229 ++++++++++++
digital_land/phase_polars/load.py | 39 +++
digital_land/phase_polars/load/__init__.py | 0
.../phase_polars/load/save_database.py | 0
digital_land/phase_polars/load/save_file.py | 0
digital_land/phase_polars/lookup.py | 327 +++++++++++++++++
digital_land/phase_polars/map.py | 93 +++++
digital_land/phase_polars/migrate.py | 64 ++++
digital_land/phase_polars/normalise.py | 84 +++++
digital_land/phase_polars/organisation.py | 52 +++
digital_land/phase_polars/patch.py | 87 +++++
digital_land/phase_polars/phase.py | 14 +
digital_land/phase_polars/pivot.py | 70 ++++
digital_land/phase_polars/prefix.py | 30 ++
digital_land/phase_polars/priority.py | 59 ++++
digital_land/phase_polars/prune.py | 86 +++++
digital_land/phase_polars/reference.py | 133 +++++++
digital_land/phase_polars/save.py | 45 +++
.../phase_polars/transform/__init__.py | 0
.../phase_polars/transform/concat_field.py | 0
.../phase_polars/transform/convert.py | 0
.../phase_polars/transform/entity_lookup.py | 0
.../transform/entity_reference.py | 0
.../phase_polars/transform/fact_hash.py | 0
.../phase_polars/transform/field_prune.py | 0
digital_land/phase_polars/transform/filter.py | 0
.../phase_polars/transform/flatten.py | 0
digital_land/phase_polars/transform/map.py | 0
.../phase_polars/transform/migrate.py | 0
.../phase_polars/transform/normalise.py | 0
digital_land/phase_polars/transform/parse.py | 0
digital_land/phase_polars/transform/patch.py | 0
digital_land/phase_polars/transform/pivot.py | 0
.../phase_polars/transform/priority.py | 0
.../transform/resolve_organisation.py | 0
.../phase_polars/transform/set_default.py | 0
.../phase_polars/transform/validate.py | 0
pyproject.toml | 1 +
test_polars_phases.py | 328 ++++++++++++++++++
49 files changed, 2587 insertions(+), 122 deletions(-)
delete mode 100644 digital_land/phase_polars/README.md
create mode 100644 digital_land/phase_polars/combine.py
create mode 100644 digital_land/phase_polars/concat.py
create mode 100644 digital_land/phase_polars/convert.py
create mode 100644 digital_land/phase_polars/default.py
create mode 100644 digital_land/phase_polars/dump.py
create mode 100644 digital_land/phase_polars/factor.py
create mode 100644 digital_land/phase_polars/filter.py
create mode 100644 digital_land/phase_polars/harmonise.py
create mode 100644 digital_land/phase_polars/load.py
delete mode 100644 digital_land/phase_polars/load/__init__.py
delete mode 100644 digital_land/phase_polars/load/save_database.py
delete mode 100644 digital_land/phase_polars/load/save_file.py
create mode 100644 digital_land/phase_polars/lookup.py
create mode 100644 digital_land/phase_polars/map.py
create mode 100644 digital_land/phase_polars/migrate.py
create mode 100644 digital_land/phase_polars/normalise.py
create mode 100644 digital_land/phase_polars/organisation.py
create mode 100644 digital_land/phase_polars/patch.py
create mode 100644 digital_land/phase_polars/phase.py
create mode 100644 digital_land/phase_polars/pivot.py
create mode 100644 digital_land/phase_polars/prefix.py
create mode 100644 digital_land/phase_polars/priority.py
create mode 100644 digital_land/phase_polars/prune.py
create mode 100644 digital_land/phase_polars/reference.py
create mode 100644 digital_land/phase_polars/save.py
delete mode 100644 digital_land/phase_polars/transform/__init__.py
delete mode 100644 digital_land/phase_polars/transform/concat_field.py
delete mode 100644 digital_land/phase_polars/transform/convert.py
delete mode 100644 digital_land/phase_polars/transform/entity_lookup.py
delete mode 100644 digital_land/phase_polars/transform/entity_reference.py
delete mode 100644 digital_land/phase_polars/transform/fact_hash.py
delete mode 100644 digital_land/phase_polars/transform/field_prune.py
delete mode 100644 digital_land/phase_polars/transform/filter.py
delete mode 100644 digital_land/phase_polars/transform/flatten.py
delete mode 100644 digital_land/phase_polars/transform/map.py
delete mode 100644 digital_land/phase_polars/transform/migrate.py
delete mode 100644 digital_land/phase_polars/transform/normalise.py
delete mode 100644 digital_land/phase_polars/transform/parse.py
delete mode 100644 digital_land/phase_polars/transform/patch.py
delete mode 100644 digital_land/phase_polars/transform/pivot.py
delete mode 100644 digital_land/phase_polars/transform/priority.py
delete mode 100644 digital_land/phase_polars/transform/resolve_organisation.py
delete mode 100644 digital_land/phase_polars/transform/set_default.py
delete mode 100644 digital_land/phase_polars/transform/validate.py
create mode 100644 test_polars_phases.py
diff --git a/digital_land/commands.py b/digital_land/commands.py
index 463f0f45..67bb4c14 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -59,6 +59,32 @@
from digital_land.phase.save import SavePhase
from digital_land.pipeline import run_pipeline, Lookups, Pipeline
from digital_land.pipeline.process import convert_tranformed_csv_to_pq
+from digital_land.phase_polars import run_polars_pipeline
+from digital_land.phase_polars import (
+ ConvertPhase as PolarsConvertPhase,
+ NormalisePhase as PolarsNormalisePhase,
+ ConcatFieldPhase as PolarsConcatFieldPhase,
+ FilterPhase as PolarsFilterPhase,
+ MapPhase as PolarsMapPhase,
+ PatchPhase as PolarsPatchPhase,
+ HarmonisePhase as PolarsHarmonisePhase,
+ DefaultPhase as PolarsDefaultPhase,
+ MigratePhase as PolarsMigratePhase,
+ OrganisationPhase as PolarsOrganisationPhase,
+ FieldPrunePhase as PolarsFieldPrunePhase,
+ EntityPrunePhase as PolarsEntityPrunePhase,
+ FactPrunePhase as PolarsFactPrunePhase,
+ EntityReferencePhase as PolarsEntityReferencePhase,
+ FactReferencePhase as PolarsFactReferencePhase,
+ EntityPrefixPhase as PolarsEntityPrefixPhase,
+ EntityLookupPhase as PolarsEntityLookupPhase,
+ FactLookupPhase as PolarsFactLookupPhase,
+ SavePhase as PolarsSavePhase,
+ PivotPhase as PolarsPivotPhase,
+ FactCombinePhase as PolarsFactCombinePhase,
+ FactorPhase as PolarsFactorPhase,
+ PriorityPhase as PolarsPriorityPhase,
+)
from digital_land.schema import Schema
from digital_land.update import add_source_endpoint
from digital_land.configuration.main import Config
@@ -237,6 +263,7 @@ def pipeline_run(
resource=None,
output_log_dir=None,
converted_path=None,
+ use_polars=False,
):
# set up paths
cache_dir = Path(cache_dir)
@@ -302,87 +329,168 @@ def pipeline_run(
if "entry-date" not in default_values:
default_values["entry-date"] = entry_date
- # TODO Migrate all of this into a function in the Pipeline function
- run_pipeline(
- ConvertPhase(
- path=input_path,
- dataset_resource_log=dataset_resource_log,
- converted_resource_log=converted_resource_log,
- output_path=converted_path,
- ),
- NormalisePhase(skip_patterns=skip_patterns),
- ParsePhase(),
- ConcatFieldPhase(concats=concats, log=column_field_log),
- FilterPhase(filters=pipeline.filters(resource)),
- MapPhase(
- fieldnames=intermediate_fieldnames,
- columns=columns,
- log=column_field_log,
- ),
- FilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)),
- PatchPhase(
- issues=issue_log,
- patches=patches,
- ),
- HarmonisePhase(
- field_datatype_map=specification.get_field_datatype_map(),
- issues=issue_log,
- dataset=dataset,
- valid_category_values=valid_category_values,
- ),
- DefaultPhase(
- default_fields=default_fields,
- default_values=default_values,
- issues=issue_log,
- ),
- # TBD: move migrating columns to fields to be immediately after map
- # this will simplify harmonisation and remove intermediate_fieldnames
- # but effects brownfield-land and other pipelines which operate on columns
- MigratePhase(
- fields=specification.schema_field[schema],
- migrations=pipeline.migrations(),
- ),
- OrganisationPhase(organisation=organisation, issues=issue_log),
- FieldPrunePhase(fields=specification.current_fieldnames(schema)),
- EntityReferencePhase(
- dataset=dataset,
- prefix=specification.dataset_prefix(dataset),
- issues=issue_log,
- ),
- EntityPrefixPhase(dataset=dataset),
- EntityLookupPhase(
- lookups=lookups,
- redirect_lookups=redirect_lookups,
- issue_log=issue_log,
- operational_issue_log=operational_issue_log,
- entity_range=[entity_range_min, entity_range_max],
- ),
- SavePhase(
- default_output_path("harmonised", input_path),
- fieldnames=intermediate_fieldnames,
- enabled=save_harmonised,
- ),
- EntityPrunePhase(dataset_resource_log=dataset_resource_log),
- PriorityPhase(config=config, providers=organisations),
- PivotPhase(),
- FactCombinePhase(issue_log=issue_log, fields=combine_fields),
- FactorPhase(),
- FactReferencePhase(
- field_typology_map=specification.get_field_typology_map(),
- field_prefix_map=specification.get_field_prefix_map(),
- ),
- FactLookupPhase(
- lookups=lookups,
- redirect_lookups=redirect_lookups,
- issue_log=issue_log,
- odp_collections=specification.get_odp_collections(),
- ),
- FactPrunePhase(),
- SavePhase(
- output_path,
- fieldnames=specification.factor_fieldnames(),
- ),
- )
+ if use_polars:
+        # ── Polars-based pipeline ──────────────────────────────────
+ run_polars_pipeline(
+ PolarsConvertPhase(
+ path=input_path,
+ dataset_resource_log=dataset_resource_log,
+ converted_resource_log=converted_resource_log,
+ output_path=converted_path,
+ ),
+ PolarsNormalisePhase(skip_patterns=skip_patterns),
+            # ParsePhase is not needed — ConvertPhase already produces a DataFrame
+ PolarsConcatFieldPhase(concats=concats, log=column_field_log),
+ PolarsFilterPhase(filters=pipeline.filters(resource)),
+ PolarsMapPhase(
+ fieldnames=intermediate_fieldnames,
+ columns=columns,
+ log=column_field_log,
+ ),
+ PolarsFilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)),
+ PolarsPatchPhase(
+ issues=issue_log,
+ patches=patches,
+ ),
+ PolarsHarmonisePhase(
+ field_datatype_map=specification.get_field_datatype_map(),
+ issues=issue_log,
+ dataset=dataset,
+ valid_category_values=valid_category_values,
+ ),
+ PolarsDefaultPhase(
+ default_fields=default_fields,
+ default_values=default_values,
+ issues=issue_log,
+ ),
+ PolarsMigratePhase(
+ fields=specification.schema_field[schema],
+ migrations=pipeline.migrations(),
+ ),
+ PolarsOrganisationPhase(organisation=organisation, issues=issue_log),
+ PolarsFieldPrunePhase(fields=specification.current_fieldnames(schema)),
+ PolarsEntityReferencePhase(
+ dataset=dataset,
+ prefix=specification.dataset_prefix(dataset),
+ issues=issue_log,
+ ),
+ PolarsEntityPrefixPhase(dataset=dataset),
+ PolarsEntityLookupPhase(
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ operational_issue_log=operational_issue_log,
+ entity_range=[entity_range_min, entity_range_max],
+ ),
+ PolarsSavePhase(
+ default_output_path("harmonised", input_path),
+ fieldnames=intermediate_fieldnames,
+ enabled=save_harmonised,
+ ),
+ PolarsEntityPrunePhase(dataset_resource_log=dataset_resource_log),
+ PolarsPriorityPhase(config=config, providers=organisations),
+ PolarsPivotPhase(),
+ PolarsFactCombinePhase(issue_log=issue_log, fields=combine_fields),
+ PolarsFactorPhase(),
+ PolarsFactReferencePhase(
+ field_typology_map=specification.get_field_typology_map(),
+ field_prefix_map=specification.get_field_prefix_map(),
+ ),
+ PolarsFactLookupPhase(
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ odp_collections=specification.get_odp_collections(),
+ ),
+ PolarsFactPrunePhase(),
+ PolarsSavePhase(
+ output_path,
+ fieldnames=specification.factor_fieldnames(),
+ ),
+ )
+ else:
+        # ── Original streaming pipeline ────────────────────────────
+ # TODO Migrate all of this into a function in the Pipeline function
+ run_pipeline(
+ ConvertPhase(
+ path=input_path,
+ dataset_resource_log=dataset_resource_log,
+ converted_resource_log=converted_resource_log,
+ output_path=converted_path,
+ ),
+ NormalisePhase(skip_patterns=skip_patterns),
+ ParsePhase(),
+ ConcatFieldPhase(concats=concats, log=column_field_log),
+ FilterPhase(filters=pipeline.filters(resource)),
+ MapPhase(
+ fieldnames=intermediate_fieldnames,
+ columns=columns,
+ log=column_field_log,
+ ),
+ FilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)),
+ PatchPhase(
+ issues=issue_log,
+ patches=patches,
+ ),
+ HarmonisePhase(
+ field_datatype_map=specification.get_field_datatype_map(),
+ issues=issue_log,
+ dataset=dataset,
+ valid_category_values=valid_category_values,
+ ),
+ DefaultPhase(
+ default_fields=default_fields,
+ default_values=default_values,
+ issues=issue_log,
+ ),
+ # TBD: move migrating columns to fields to be immediately after map
+ # this will simplify harmonisation and remove intermediate_fieldnames
+ # but effects brownfield-land and other pipelines which operate on columns
+ MigratePhase(
+ fields=specification.schema_field[schema],
+ migrations=pipeline.migrations(),
+ ),
+ OrganisationPhase(organisation=organisation, issues=issue_log),
+ FieldPrunePhase(fields=specification.current_fieldnames(schema)),
+ EntityReferencePhase(
+ dataset=dataset,
+ prefix=specification.dataset_prefix(dataset),
+ issues=issue_log,
+ ),
+ EntityPrefixPhase(dataset=dataset),
+ EntityLookupPhase(
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ operational_issue_log=operational_issue_log,
+ entity_range=[entity_range_min, entity_range_max],
+ ),
+ SavePhase(
+ default_output_path("harmonised", input_path),
+ fieldnames=intermediate_fieldnames,
+ enabled=save_harmonised,
+ ),
+ EntityPrunePhase(dataset_resource_log=dataset_resource_log),
+ PriorityPhase(config=config, providers=organisations),
+ PivotPhase(),
+ FactCombinePhase(issue_log=issue_log, fields=combine_fields),
+ FactorPhase(),
+ FactReferencePhase(
+ field_typology_map=specification.get_field_typology_map(),
+ field_prefix_map=specification.get_field_prefix_map(),
+ ),
+ FactLookupPhase(
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ odp_collections=specification.get_odp_collections(),
+ ),
+ FactPrunePhase(),
+ SavePhase(
+ output_path,
+ fieldnames=specification.factor_fieldnames(),
+ ),
+ )
# In the FactCombinePhase, when combine_fields has some values, we check for duplicates and combine values.
# If we have done this then we will not call duplicate_reference_check as we have already carried out a
diff --git a/digital_land/phase_polars/README.md b/digital_land/phase_polars/README.md
deleted file mode 100644
index 853f2fbf..00000000
--- a/digital_land/phase_polars/README.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# Phases
-
-This directory contains transformation phases used in the digital-land data pipeline. Phases are modular processing steps that transform and validate data.
-
-## Transform Phases
-
-The `transform` folder contains the core data transformation phases executed in sequence:
-
-### Data Transformation Pipeline
-
-1. **01_convert.py** - Convert data types and formats
-2. **02_normalise.py** - Normalize data values and structure
-3. **03_parse.py** - Parse and extract data from raw inputs
-4. **04_concat_field.py** - Concatenate multiple fields
-5. **05_filter.py** - Filter records based on criteria
-6. **06_map.py** - Map values between different formats
-7. **07_patch.py** - Apply patches to data records
-8. **08_validate.py** - Validate data against schema
-9. **09_set_default.py** - Set default values for missing data
-10. **10_migrate.py** - Migrate data structure/format
-11. **11_resolve_organisation.py** - Resolve and enrich organisation references
-12. **12_field_prune.py** - Remove unnecessary fields
-13. **13_entity_reference.py** - Handle entity references
-14. **14_entity_lookup.py** - Lookup and enrich entity data
-15. **15_pivot.py** - Pivot data structure
-16. **16_fact_hash.py** - Generate fact hashes for deduplication
-17. **17_flatten.py** - Flatten nested data structures
-
-## Load Phases
-
-The `load` folder contains phases for saving and storing data:
-
-1. **01_save_file.py** - Save data to file storage
-2. **02_save_database.py** - Save data to database
-
-## Overview
-
-Each phase is designed to be:
-- **Modular** - Can be used independently or in sequence
-- **Configurable** - Parameters can be customized via configuration
-- **Reusable** - Shared across different pipelines and workflows
diff --git a/digital_land/phase_polars/__init__.py b/digital_land/phase_polars/__init__.py
index e69de29b..50a7e1d5 100644
--- a/digital_land/phase_polars/__init__.py
+++ b/digital_land/phase_polars/__init__.py
@@ -0,0 +1,87 @@
+"""
+Polars-based pipeline phases.
+
+Drop-in replacements for the streaming phases in `digital_land.phase`.
+Each phase accepts and returns a `polars.DataFrame` instead of a generator.
+"""
+
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+from .convert import ConvertPhase
+from .normalise import NormalisePhase
+from .concat import ConcatFieldPhase
+from .filter import FilterPhase
+from .map import MapPhase
+from .patch import PatchPhase
+from .harmonise import HarmonisePhase
+from .default import DefaultPhase
+from .migrate import MigratePhase
+from .organisation import OrganisationPhase
+from .prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase
+from .reference import EntityReferencePhase, FactReferencePhase
+from .prefix import EntityPrefixPhase
+from .lookup import EntityLookupPhase, FactLookupPhase, PrintLookupPhase
+from .save import SavePhase
+from .pivot import PivotPhase
+from .combine import FactCombinePhase
+from .factor import FactorPhase
+from .priority import PriorityPhase
+from .dump import DumpPhase
+from .load import LoadPhase
+
+logger = logging.getLogger(__name__)
+
+
def run_polars_pipeline(*phases):
    """
    Execute a chain of Polars phases in order.

    The DataFrame returned by each phase becomes the input of the next.
    The chain starts from ``None``, so the first phase (typically
    ConvertPhase) is responsible for creating the initial DataFrame.
    Returns whatever the final phase produced (may be ``None``).
    """
    log = logging.getLogger(__name__)
    current = None
    for step in phases:
        step_name = step.__class__.__name__
        log.debug(f"running polars phase {step_name}")
        current = step.process(current)
        if current is not None:
            # columns prefixed "__" are internal pipeline metadata
            data_cols = [c for c in current.columns if not c.startswith("__")]
            log.debug(
                f" -> {step_name} produced {current.height} rows, "
                f"{len(data_cols)} data cols"
            )
    return current
+
+
+__all__ = [
+ "PolarsPhase",
+ "ConvertPhase",
+ "NormalisePhase",
+ "ConcatFieldPhase",
+ "FilterPhase",
+ "MapPhase",
+ "PatchPhase",
+ "HarmonisePhase",
+ "DefaultPhase",
+ "MigratePhase",
+ "OrganisationPhase",
+ "FieldPrunePhase",
+ "EntityPrunePhase",
+ "FactPrunePhase",
+ "EntityReferencePhase",
+ "FactReferencePhase",
+ "EntityPrefixPhase",
+ "EntityLookupPhase",
+ "FactLookupPhase",
+ "PrintLookupPhase",
+ "SavePhase",
+ "PivotPhase",
+ "FactCombinePhase",
+ "FactorPhase",
+ "PriorityPhase",
+ "DumpPhase",
+ "LoadPhase",
+ "run_polars_pipeline",
+]
diff --git a/digital_land/phase_polars/combine.py b/digital_land/phase_polars/combine.py
new file mode 100644
index 00000000..db4d17da
--- /dev/null
+++ b/digital_land/phase_polars/combine.py
@@ -0,0 +1,87 @@
+from copy import deepcopy
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+try:
+ from shapely.ops import unary_union
+ from shapely.geometry import MultiPolygon
+ import shapely.wkt
+ from digital_land.datatype.wkt import dump_wkt
+
+ HAS_SHAPELY = True
+except ImportError:
+ HAS_SHAPELY = False
+
+
def combine_geometries(wkts, precision=6):
    """
    Union a list of WKT geometry strings into one MultiPolygon WKT.

    The union result is wrapped in a MultiPolygon when it is a single
    Polygon so callers always receive a uniform geometry type.
    """
    parsed = [shapely.wkt.loads(text) for text in wkts]
    merged = unary_union(parsed)
    if not isinstance(merged, MultiPolygon):
        merged = MultiPolygon([merged])
    return dump_wkt(merged, precision=precision)
+
+
class FactCombinePhase(PolarsPhase):
    """
    Combine field values from multiple facts for the same entity.

    For each field listed in ``fields``, all values belonging to the same
    (entity, field) pair are merged into a single combined value; every
    original fact row is kept but its "value" is replaced with the merged
    value.  Note the output row order changes: rows for non-combinable
    fields come first, followed by the combined rows.
    """

    def __init__(self, issue_log=None, fields=None):
        # fields: either a dict of fieldname -> separator string, or a
        # plain iterable of fieldnames (then ";" is used as the separator)
        if fields is None:
            fields = {}
        self.issues = issue_log
        self.fields = fields

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return *df* with values combined per (entity, field) group."""
        if df is None or df.height == 0 or not self.fields:
            return df

        # nothing to group on without these columns
        if "field" not in df.columns or "entity" not in df.columns:
            return df

        combine_field_names = set(self.fields.keys()) if isinstance(self.fields, dict) else set(self.fields)

        # Split into combinable and non-combinable
        mask = pl.col("field").is_in(list(combine_field_names))
        pass_through = df.filter(~mask)
        to_combine = df.filter(mask)

        if to_combine.height == 0:
            return pass_through

        # Group by entity + field and combine values
        combined_rows = []
        for (entity, field), group_df in to_combine.group_by(["entity", "field"]):
            # blank/null values are dropped; de-duplicate and sort so the
            # combined value is deterministic regardless of input order
            values = [
                v
                for v in group_df["value"].to_list()
                if v is not None and v != ""
            ]
            values = sorted(set(values))

            # geometry gets a spatial union when shapely is available;
            # otherwise values are joined with the configured separator
            if field == "geometry" and HAS_SHAPELY and values:
                combined_value = combine_geometries(values)
            elif isinstance(self.fields, dict) and field in self.fields:
                separator = self.fields[field]
                combined_value = separator.join(values)
            else:
                combined_value = ";".join(values)

            # Emit rows for each original row in the group
            for row in group_df.iter_rows(named=True):
                if self.issues:
                    # record positional context before logging the issue
                    self.issues.line_number = row.get("line-number", row.get("__line_number", ""))
                    self.issues.entry_number = row.get("entry-number", row.get("__entry_number", ""))
                    self.issues.log_issue(field, "combined-value", entity)

                new_row = dict(row)
                new_row["value"] = combined_value
                combined_rows.append(new_row)

        if combined_rows:
            # reuse the input schema so dtypes survive the rebuild
            combined_df = pl.DataFrame(combined_rows, schema=df.schema)
            return pl.concat([pass_through, combined_df])

        return pass_through
diff --git a/digital_land/phase_polars/concat.py b/digital_land/phase_polars/concat.py
new file mode 100644
index 00000000..111c2bd9
--- /dev/null
+++ b/digital_land/phase_polars/concat.py
@@ -0,0 +1,90 @@
+import itertools
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class ConcatFieldPhase(PolarsPhase):
    """
    Concatenate multiple source fields into a single destination field.

    For each configured destination field, the existing destination value
    (if any) and every non-blank source field value are joined with the
    configured separator, then wrapped with the prepend/append strings.
    """

    def __init__(self, concats=None, log=None):
        """
        concats -- mapping of destination fieldname to a dict with keys
            "fields" (source fieldnames), "separator", "prepend", "append".
        log -- optional log object; each concat is recorded as a pseudo
            column expression for traceability.
        """
        if concats is None:
            concats = {}
        self.concats = concats

        if log:
            for fieldname, cat in self.concats.items():
                log.add(
                    fieldname,
                    cat["prepend"]
                    + cat["separator"].join(cat["fields"])
                    + cat["append"],
                )

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return *df* with each configured destination field rebuilt."""
        if df is None or df.height == 0 or not self.concats:
            return df

        for fieldname, cat in self.concats.items():
            prepend = cat["prepend"]
            separator = cat["separator"]
            append = cat["append"]
            source_fields = cat["fields"]

            # Ensure the destination column exists
            if fieldname not in df.columns:
                df = df.with_columns(pl.lit("").alias(fieldname))

            # One expression per value to concatenate: the existing
            # destination value first, then each present source field.
            # Blank/whitespace-only source values become null so they can
            # be skipped during the join below.
            parts = [pl.col(fieldname).fill_null("")]
            for h in source_fields:
                if h in df.columns:
                    parts.append(
                        pl.when(
                            pl.col(h).is_not_null()
                            & (pl.col(h).str.strip_chars() != "")
                        )
                        .then(pl.col(h))
                        .otherwise(pl.lit(None))
                    )

            # Materialise the parts as temporary columns, then join the
            # non-blank values per row with map_elements.
            # (Removed an unused `_concat_row` closure that duplicated the
            # lambda below but was never called.)
            struct_cols = []
            temp_names = []
            for i, part in enumerate(parts):
                name = f"__concat_part_{i}"
                temp_names.append(name)
                struct_cols.append(part.alias(name))

            df = df.with_columns(struct_cols)

            df = df.with_columns(
                pl.struct(temp_names)
                .map_elements(
                    lambda s, sep=separator, pre=prepend, app=append: (
                        pre
                        + sep.join(
                            v
                            for v in s.values()
                            if v is not None and str(v).strip() != ""
                        )
                        + app
                    ),
                    return_dtype=pl.Utf8,
                )
                .alias(fieldname)
            )

            df = df.drop(temp_names)

        return df
diff --git a/digital_land/phase_polars/convert.py b/digital_land/phase_polars/convert.py
new file mode 100644
index 00000000..35380def
--- /dev/null
+++ b/digital_land/phase_polars/convert.py
@@ -0,0 +1,226 @@
+import csv
+import logging
+import os
+import tempfile
+import time
+from pathlib import Path
+
+import polars as pl
+
+from .phase import PolarsPhase
+from ..phase.convert import (
+ ConversionError,
+ convert_features_to_csv,
+ convert_json_to_csv,
+ detect_file_encoding,
+ read_csv,
+ read_excel,
+)
+from ..log import ConvertedResourceLog
+
+import sqlite3
+import zipfile
+
+logger = logging.getLogger(__name__)
+
+
+class ConvertPhase(PolarsPhase):
+ """
+ Detect and convert input file format then load into a Polars DataFrame.
+
+ Re-uses the existing format-detection and conversion helpers so the
+ behaviour is identical to the streaming ConvertPhase.
+ """
+
+ def __init__(
+ self,
+ path=None,
+ dataset_resource_log=None,
+ converted_resource_log=None,
+ output_path=None,
+ ):
+ self.path = path
+ self.dataset_resource_log = dataset_resource_log
+ self.converted_resource_log = converted_resource_log
+ self.charset = ""
+ self.output_path = output_path
+ if output_path:
+ output_dir = os.path.dirname(str(output_path))
+ if output_dir and not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+
+ def _resource_from_path(self, path):
+ return Path(path).stem
+
+ def _find_zip_file(self, input_file, suffix=".gml"):
+ zip_ = zipfile.ZipFile(input_file)
+ files = zip_.namelist()
+ files = list(
+ set(
+ filter(
+ lambda s: s.endswith(suffix) or s.endswith(suffix.upper()), files
+ )
+ )
+ )
+ if not files or not len(files):
+ return None
+ if len(files) > 1:
+ raise ValueError("Zipfile contains more than one %s file" % suffix)
+ return "/" + files[0]
+
+ def find_internal_path(self, input_path):
+ for suffix, mime in [
+ (".shp", "x-gis/x-shapefile"),
+ (".gml", "application/gml+xml"),
+ (".tab", "x-gis/x-mapinfo-tab"),
+ (".geojson", "application/vnd.geo+json"),
+ (".json", "application/vnd.geo+json"),
+ (".kml", "application/vnd.google-earth.kml+xml"),
+ ]:
+ internal_path = self._find_zip_file(input_path, suffix)
+ if internal_path:
+ return internal_path, mime
+ return None, None
+
+ def _get_csv_path(self, input_path):
+ """Return (csv_path, should_delete_temp) by converting the input to CSV if needed."""
+
+ # Try binary formats first
+ excel = read_excel(input_path)
+ if excel is not None:
+ logger.debug(f"{input_path} looks like excel")
+ if self.dataset_resource_log:
+ self.dataset_resource_log.mime_type = "application/vnd.ms-excel"
+ tmp = self.output_path or tempfile.NamedTemporaryFile(
+ suffix=".csv", delete=False
+ ).name
+ excel.to_csv(
+ str(tmp), index=False, header=True, encoding="utf-8", quoting=csv.QUOTE_ALL
+ )
+ return str(tmp), False
+
+ if zipfile.is_zipfile(input_path):
+ logger.debug(f"{input_path} looks like zip")
+ if self.dataset_resource_log:
+ self.dataset_resource_log.mime_type = "application/zip"
+ internal_path, mime_type = self.find_internal_path(input_path)
+ if internal_path:
+ if self.dataset_resource_log:
+ self.dataset_resource_log.internal_path = internal_path
+ self.dataset_resource_log.internal_mime_type = mime_type
+ parent = str(self.output_path.parent) if self.output_path else None
+ tmp = tempfile.NamedTemporaryFile(suffix=".zip", dir=parent).name
+ os.link(input_path, tmp)
+ zip_path = f"/vsizip/{tmp}{internal_path}"
+ csv_path = convert_features_to_csv(zip_path, self.output_path)
+ return csv_path, False
+
+ try:
+ conn = sqlite3.connect(input_path)
+ cursor = conn.cursor()
+ cursor.execute("pragma quick_check")
+ conn.close()
+ logger.debug(f"{input_path} looks like SQLite")
+ if self.dataset_resource_log:
+ self.dataset_resource_log.mime_type = "application/geopackage+sqlite3"
+ csv_path = convert_features_to_csv(input_path, self.output_path)
+ return csv_path, False
+ except Exception:
+ pass
+
+ # Text-based formats
+ encoding = detect_file_encoding(input_path)
+ if not encoding:
+ raise ConversionError(f"Cannot detect encoding for {input_path}")
+
+ self.charset = ";charset=" + encoding
+ with open(input_path, encoding=encoding) as f:
+ content = f.read(10)
+
+ if content.lower().startswith(" pl.DataFrame:
+ if df is None or df.height == 0:
+ return df
+
+ # Apply default_fields: if field is empty, copy from another field
+ for field, default_field in self.default_fields.items():
+ if default_field not in df.columns:
+ continue
+ if field not in df.columns:
+ df = df.with_columns(pl.lit("").alias(field))
+
+ df = df.with_columns(
+ pl.when(
+ pl.col(field).is_null()
+ | (pl.col(field) == "")
+ )
+ .then(
+ pl.when(
+ pl.col(default_field).is_not_null()
+ & (pl.col(default_field) != "")
+ )
+ .then(pl.col(default_field))
+ .otherwise(pl.col(field))
+ )
+ .otherwise(pl.col(field))
+ .alias(field)
+ )
+
+ # Apply default_values: if field is empty, use a fixed default value
+ for field, value in self.default_values.items():
+ if not value:
+ continue
+
+ if field not in df.columns:
+ df = df.with_columns(pl.lit("").alias(field))
+
+ df = df.with_columns(
+ pl.when(
+ pl.col(field).is_null()
+ | (pl.col(field) == "")
+ )
+ .then(pl.lit(value))
+ .otherwise(pl.col(field))
+ .alias(field)
+ )
+
+ return df
diff --git a/digital_land/phase_polars/dump.py b/digital_land/phase_polars/dump.py
new file mode 100644
index 00000000..cd4ff3ce
--- /dev/null
+++ b/digital_land/phase_polars/dump.py
@@ -0,0 +1,29 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class DumpPhase(PolarsPhase):
    """
    Write the current DataFrame out as CSV, excluding the internal
    (double-underscore) metadata columns, then pass the DataFrame
    through unchanged so the pipeline can continue.
    """

    def __init__(self, path=None, f=None, enabled=True):
        # an open file-like object `f` takes precedence over `path`
        self.path = path
        self.f = f
        self.enabled = enabled

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Dump *df* (minus metadata columns) to CSV; return *df* unmodified."""
        if not self.enabled or df is None or df.height == 0:
            return df

        visible = [name for name in df.columns if not name.startswith("__")]
        dump_frame = df.select(visible)

        if self.f:
            self.f.write(dump_frame.write_csv())
        elif self.path:
            dump_frame.write_csv(str(self.path))

        return df
diff --git a/digital_land/phase_polars/factor.py b/digital_land/phase_polars/factor.py
new file mode 100644
index 00000000..7ef53295
--- /dev/null
+++ b/digital_land/phase_polars/factor.py
@@ -0,0 +1,40 @@
+import hashlib
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
def fact_hash(entity, field, value):
    """Return a deterministic fact identifier: SHA-256 of "entity:field:value"."""
    digest_input = ":".join((entity, field, value))
    return hashlib.sha256(digest_input.encode("utf-8")).hexdigest()
+
+
class FactorPhase(PolarsPhase):
    """
    Attach a "fact" column containing a hash identifier for each fact row.

    Rows without an entity receive an empty fact id.  Frames missing any
    of the entity/field/value columns are passed through untouched.
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        if df is None or df.height == 0:
            return df

        required = ("entity", "field", "value")
        if any(column not in df.columns for column in required):
            return df

        def _row_fact(row):
            # no entity means the fact cannot be attributed; leave it blank
            if not row["entity"]:
                return ""
            return fact_hash(
                str(row["entity"] or ""),
                str(row["field"] or ""),
                str(row["value"] or ""),
            )

        return df.with_columns(
            pl.struct(["entity", "field", "value"])
            .map_elements(_row_fact, return_dtype=pl.Utf8)
            .alias("fact")
        )
diff --git a/digital_land/phase_polars/filter.py b/digital_land/phase_polars/filter.py
new file mode 100644
index 00000000..4eac4358
--- /dev/null
+++ b/digital_land/phase_polars/filter.py
@@ -0,0 +1,32 @@
+import re
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class FilterPhase(PolarsPhase):
    """
    Keep only rows whose field values match the configured regex patterns.

    Every configured pattern must match (anchored at the start of the
    value) for a row to be kept; fields absent from the frame are ignored.
    """

    def __init__(self, filters=None):
        # compile up front so invalid patterns fail fast at construction
        self.filters = {
            field: re.compile(pattern)
            for field, pattern in (filters or {}).items()
        }

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        if df is None or df.height == 0 or not self.filters:
            return df

        keep = pl.lit(True)
        for field, compiled in self.filters.items():
            if field not in df.columns:
                continue
            # anchor at the start to mirror re.match semantics
            anchored = f"^(?:{compiled.pattern})"
            keep = keep & pl.col(field).fill_null("").str.contains(anchored)

        return df.filter(keep)
diff --git a/digital_land/phase_polars/harmonise.py b/digital_land/phase_polars/harmonise.py
new file mode 100644
index 00000000..204f7e4a
--- /dev/null
+++ b/digital_land/phase_polars/harmonise.py
@@ -0,0 +1,229 @@
+import logging
+from datetime import datetime, date
+from calendar import monthrange
+
+import polars as pl
+
+from .phase import PolarsPhase
+from digital_land.datatype.point import PointDataType
+from digital_land.datatype.factory import datatype_factory
+
+try:
+ import shapely.wkt
+except ImportError:
+ shapely = None
+
+logger = logging.getLogger(__name__)
+
+MANDATORY_FIELDS_DICT = {
+ "article-4-direction": [
+ "reference", "name", "document-url", "documentation-url",
+ ],
+ "article-4-direction-area": [
+ "reference", "geometry", "name", "permitted-development-rights",
+ ],
+ "conservation-area": ["reference", "geometry", "name"],
+ "conservation-area-document": [
+ "reference", "name", "conservation-area",
+ "document-url", "documentation-url", "document-type",
+ ],
+ "tree-preservation-order": [
+ "reference", "document-url", "documentation-url",
+ ],
+ "tree-preservation-zone": ["reference", "geometry"],
+ "listed-building-outline": ["reference", "geometry", "name", "listed-building"],
+ "tree": ["reference", "point", "geometry"],
+ "brownfield-land": [
+ "OrganisationURI", "SiteReference", "SiteNameAddress", "GeoX", "GeoY",
+ ],
+}
+
+FAR_FUTURE_YEARS_AHEAD = 50
+
+
class HarmonisePhase(PolarsPhase):
    """
    Harmonise field values according to their datatype specification.

    This phase delegates to the existing datatype normalisation logic on a
    per-row basis (iterating the frame row by row) for correctness, since
    individual datatype classes contain complex transformation rules.
    """

    def __init__(
        self,
        field_datatype_map=None,
        issues=None,
        dataset=None,
        valid_category_values=None,
    ):
        # field_datatype_map: fieldname -> datatype name understood by
        #   datatype_factory (e.g. "datetime")
        # issues: optional issue log mutated as rows are harmonised
        # dataset: dataset name; selects mandatory fields and CURIE prefixes
        # valid_category_values: fieldname -> list of allowed category values
        if field_datatype_map is None:
            field_datatype_map = {}
        if valid_category_values is None:
            valid_category_values = {}
        self.field_datatype_map = field_datatype_map
        self.issues = issues
        self.dataset = dataset
        self.valid_category_values = valid_category_values

    def _get_far_future_date(self, number_of_years_ahead: int):
        """Return today's date shifted *number_of_years_ahead* years forward,
        clamped to the last valid day of the target month (handles 29 Feb)."""
        today = date.today()
        y = today.year + number_of_years_ahead
        last_day = monthrange(y, today.month)[1]
        day = min(today.day, last_day)
        return today.replace(year=y, day=day)

    def _harmonise_row(self, row_dict, resource, line_number, entry_number):
        """Harmonise a single row -- mirrors the streaming HarmonisePhase exactly."""
        # set positional context first so any issue logged below carries it
        if self.issues:
            self.issues.resource = resource
            self.issues.line_number = line_number
            self.issues.entry_number = entry_number

        o = {}
        for field, value in row_dict.items():
            # "__"-prefixed keys are internal metadata, not data fields
            if field.startswith("__"):
                continue

            # Category value validation
            if field in self.valid_category_values:
                if value:
                    # case-insensitive match, with spaces treated as hyphens
                    normalised_value = value.replace(" ", "-")
                    matching_value = next(
                        (
                            v
                            for v in self.valid_category_values[field]
                            if v.lower() == normalised_value.lower()
                        ),
                        None,
                    )
                    if matching_value:
                        value = matching_value
                    else:
                        if self.issues:
                            self.issues.log_issue(
                                field, "invalid category value", value
                            )

            # Harmonise via datatype
            if not value:
                o[field] = ""
            elif field in self.field_datatype_map:
                if self.issues:
                    self.issues.fieldname = field
                datatype_name = self.field_datatype_map[field]
                if datatype_name == "datetime":
                    # datetimes are bounded to a plausible window
                    far_past_date = date(1799, 12, 31)
                    far_future_date = self._get_far_future_date(FAR_FUTURE_YEARS_AHEAD)
                    datatype = datatype_factory(
                        datatype_name=datatype_name,
                        far_past_date=far_past_date,
                        far_future_date=far_future_date,
                    )
                else:
                    datatype = datatype_factory(datatype_name=datatype_name)
                o[field] = datatype.normalise(value, issues=self.issues)
            else:
                o[field] = value

        # Future entry-date check
        for field in ["entry-date", "LastUpdatedDate"]:
            val = o.get(field, "")
            if val:
                try:
                    if datetime.strptime(val[:10], "%Y-%m-%d").date() > datetime.today().date():
                        if self.issues:
                            # log the ORIGINAL (pre-harmonise) value
                            self.issues.log_issue(
                                field, "future entry-date", row_dict.get(field, ""),
                                f"{field} must be today or in the past",
                            )
                        o[field] = ""
                except (ValueError, TypeError):
                    # unparseable dates are left to the datatype handling above
                    pass

        # GeoX/GeoY handling
        if "GeoX" in row_dict and "GeoY" in row_dict:
            if self.issues:
                self.issues.fieldname = "GeoX,GeoY"
            point = PointDataType()
            try:
                geometry = point.normalise(
                    [o.get("GeoX", ""), o.get("GeoY", "")],
                    issues=self.issues,
                )
                if geometry and shapely:
                    # write back the normalised coordinates from the WKT point
                    point_geometry = shapely.wkt.loads(geometry)
                    x, y = point_geometry.coords[0]
                    o["GeoX"] = str(x)
                    o["GeoY"] = str(y)
                elif not geometry:
                    o.pop("GeoX", None)
                    o.pop("GeoY", None)
            except Exception as e:
                logger.error(
                    f"Exception occurred while fetching geoX, geoY coordinates: {e}"
                )

        # Typology prefix
        # bare references gain the dataset CURIE prefix, e.g. "x" -> "dataset:x"
        for typology in ["organisation", "geography", "document"]:
            value = o.get(typology, "")
            if value and ":" not in value:
                o[typology] = f"{self.dataset}:{value}"

        # Mandatory field checks
        # NOTE(review): these check the ORIGINAL row values, not the
        # harmonised ones in `o` -- confirm against the streaming phase
        mandatory_fields = MANDATORY_FIELDS_DICT.get(self.dataset)
        for field in row_dict:
            if field.startswith("__"):
                continue
            if field in ["geometry", "point"]:
                # geometry/point are alternatives; only flag when BOTH are empty
                if not row_dict.get("geometry") and not row_dict.get("point"):
                    if self.issues:
                        self.issues.log_issue(
                            field, "missing value", "", f"{field} missing"
                        )
            elif mandatory_fields and field in mandatory_fields:
                if not row_dict.get(field):
                    if self.issues:
                        self.issues.log_issue(
                            field, "missing value", "", f"{field} missing"
                        )

        # Wikipedia
        # full URLs are reduced to the bare article slug
        if row_dict.get("wikipedia", "").startswith("http"):
            if self.issues:
                self.issues.log_issue(
                    "wikipedia", "removed URI prefix", row_dict["wikipedia"]
                )
            o["wikipedia"] = row_dict["wikipedia"].replace(
                "https://en.wikipedia.org/wiki/", ""
            )

        return o

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Harmonise every row of *df*, preserving metadata columns."""
        if df is None or df.height == 0:
            return df

        meta_cols = [c for c in df.columns if c.startswith("__")]
        data_cols = [c for c in df.columns if not c.startswith("__")]

        results = []
        for row in df.iter_rows(named=True):
            resource = row.get("__resource", "")
            line_number = row.get("__line_number", 0)
            entry_number = row.get("__entry_number", 0)

            harmonised = self._harmonise_row(row, resource, line_number, entry_number)

            # Include metadata
            out = {}
            for mc in meta_cols:
                out[mc] = row[mc]
            for field in data_cols:
                out[field] = harmonised.get(field, "")
            results.append(out)

        if not results:
            return df.clear()

        # rebuild the frame: data columns as strings, metadata keeps its dtype
        return pl.DataFrame(results, schema={c: pl.Utf8 for c in results[0] if not c.startswith("__")} | {c: df.schema[c] for c in meta_cols if c in df.schema})
diff --git a/digital_land/phase_polars/load.py b/digital_land/phase_polars/load.py
new file mode 100644
index 00000000..b0571bc0
--- /dev/null
+++ b/digital_land/phase_polars/load.py
@@ -0,0 +1,39 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class LoadPhase(PolarsPhase):
    """
    Read a resource CSV into a Polars DataFrame of strings.

    Adds the internal pipeline metadata columns:
      __resource     -- resource identifier (defaults to the file stem)
      __line_number  -- file line number (header is line 1, data starts at 2)
      __entry_number -- 1-based data row number
      __path         -- source file path
    """

    def __init__(self, path=None, resource=None, dataset=None):
        self.path = path
        self.resource = resource
        self.dataset = dataset

    def process(self, df=None):
        from pathlib import Path

        source_path = self.path
        resource = self.resource or (Path(source_path).stem if source_path else None)

        frame = pl.read_csv(
            str(source_path),
            infer_schema_length=0,
            null_values=[""],
            truncate_ragged_lines=True,
            ignore_errors=True,
        )
        # everything is treated as text, with blanks rather than nulls
        frame = frame.with_columns(pl.all().cast(pl.Utf8).fill_null(""))

        row_count = frame.height
        frame = frame.with_columns(
            pl.lit(resource or "").alias("__resource"),
            pl.arange(2, row_count + 2).alias("__line_number"),
            pl.arange(1, row_count + 1).alias("__entry_number"),
            pl.lit(str(source_path) if source_path else "").alias("__path"),
        )

        return frame
diff --git a/digital_land/phase_polars/load/__init__.py b/digital_land/phase_polars/load/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/load/save_database.py b/digital_land/phase_polars/load/save_database.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/load/save_file.py b/digital_land/phase_polars/load/save_file.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/lookup.py b/digital_land/phase_polars/lookup.py
new file mode 100644
index 00000000..4abed653
--- /dev/null
+++ b/digital_land/phase_polars/lookup.py
@@ -0,0 +1,327 @@
+import re
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+normalise_pattern = re.compile(r"[^a-z0-9-]")
+
+
def normalise(value):
    """Lower-case *value* and strip every character except a-z, 0-9 and "-"."""
    lowered = value.lower()
    return re.sub(r"[^a-z0-9-]", "", lowered)
+
+
def key(entry_number="", prefix="", reference="", organisation=""):
    """Build the canonical comma-separated lookup key used by the lookup phases."""
    parts = [
        str(entry_number),
        normalise(prefix),
        normalise(reference),
        normalise(organisation),
    ]
    return ",".join(parts)
+
+
class EntityLookupPhase(PolarsPhase):
    """
    Look up entity numbers by CURIE (prefix:reference).

    Rows that already carry an entity are left untouched; otherwise the
    entity is resolved via the lookup table (by entry number, then by
    organisation-qualified reference, then by bare reference), redirected
    if necessary, and issues are logged for unknown or out-of-range
    entities.
    """

    def __init__(
        self,
        lookups=None,
        redirect_lookups=None,
        issue_log=None,
        operational_issue_log=None,
        entity_range=None,
    ):
        # lookups: lookup key (see `key`) -> entity number
        # redirect_lookups: old entity -> {"status": "301"/"410", "entity": new}
        # entity_range: [min, max] bounds for valid entity numbers
        if lookups is None:
            lookups = {}
        if redirect_lookups is None:
            redirect_lookups = {}
        self.lookups = lookups
        self.redirect_lookups = redirect_lookups
        self.issues = issue_log
        self.operational_issues = operational_issue_log
        self.entity_range = entity_range or []

    def _lookup(self, prefix="", reference="", organisation="", entry_number=""):
        """Resolve an entity, trying the most specific lookup key first."""
        return (
            self.lookups.get(
                key(prefix=prefix, entry_number=entry_number), ""
            )
            or self.lookups.get(
                key(prefix=prefix, organisation=organisation, reference=reference), ""
            )
            or self.lookups.get(
                key(prefix=prefix, reference=reference), ""
            )
        )

    def _redirect(self, entity):
        """Follow a 301 redirect to the replacement entity; 410 means gone
        and yields an empty entity; otherwise return *entity* unchanged."""
        if self.redirect_lookups and entity:
            redirect_entry = self.redirect_lookups.get(str(entity), "")
            if redirect_entry:
                if redirect_entry["status"] == "301":
                    return redirect_entry["entity"]
                elif redirect_entry["status"] == "410":
                    return ""
        return entity

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fill the "entity" column for every row of *df*."""
        if df is None or df.height == 0:
            return df

        # guarantee the columns the lookup relies on exist
        if "entity" not in df.columns:
            df = df.with_columns(pl.lit("").alias("entity"))
        if "prefix" not in df.columns:
            df = df.with_columns(pl.lit("").alias("prefix"))
        if "reference" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference"))

        entities = []
        for row in df.iter_rows(named=True):
            existing = row.get("entity", "") or ""
            prefix = row.get("prefix", "") or ""
            reference = row.get("reference", "") or ""
            # legacy organisation ids are normalised to the current prefix
            organisation = (row.get("organisation", "") or "").replace(
                "local-authority-eng", "local-authority"
            )
            entry_number = row.get("__entry_number", "")
            line_number = row.get("__line_number", "")
            resource = row.get("__resource", "")

            # an already-assigned entity wins
            if existing:
                entities.append(existing)
                continue

            # no prefix means no CURIE to resolve
            if not prefix:
                entities.append("")
                continue

            entity = self._lookup(
                prefix=prefix,
                reference=reference,
                organisation=organisation,
                entry_number=entry_number,
            )

            # NOTE(review): range() excludes entity_range[1] -- confirm the
            # upper bound is intended to be exclusive
            if entity and self.entity_range:
                try:
                    if int(entity) not in range(
                        int(self.entity_range[0]), int(self.entity_range[1])
                    ):
                        if self.issues:
                            self.issues.resource = resource
                            self.issues.line_number = line_number
                            self.issues.entry_number = entry_number
                            self.issues.log_issue(
                                "entity", "entity number out of range", entity
                            )
                except (ValueError, TypeError):
                    # non-numeric entity or bounds: skip the range check
                    pass

            if not entity:
                curie = f"{prefix}:{reference}"
                if self.issues:
                    self.issues.resource = resource
                    self.issues.line_number = line_number
                    self.issues.entry_number = entry_number
                    if not reference:
                        self.issues.log_issue(
                            "entity",
                            "unknown entity - missing reference",
                            curie,
                            line_number=line_number,
                        )
                    else:
                        self.issues.log_issue(
                            "entity",
                            "unknown entity",
                            curie,
                            line_number=line_number,
                        )
                if self.operational_issues:
                    self.operational_issues.log_issue(
                        "entity",
                        "unknown entity",
                        curie,
                        line_number=line_number,
                    )
                entities.append("")
            else:
                # apply 301/410 redirects before recording the entity
                entity = self._redirect(entity)
                entities.append(entity)

        df = df.with_columns(pl.Series("entity", entities))

        # Record entity map for issue log
        if self.issues:
            for row in df.iter_rows(named=True):
                entry_number = row.get("__entry_number", "")
                entity = row.get("entity", "")
                if entity:
                    self.issues.record_entity_map(entry_number, entity)

        return df
+
+
class FactLookupPhase(PolarsPhase):
    """
    Resolve the "reference-entity" for fact rows whose value is a CURIE.

    Uses the same lookup table as EntityLookupPhase, first trying the
    organisation-qualified key and then the bare prefix:reference key.
    Entities that have been redirected with status 410 (gone) are treated
    as missing.
    """

    def __init__(
        self,
        lookups=None,
        redirect_lookups=None,
        issue_log=None,
        odp_collections=None,
    ):
        """
        lookups -- mapping of lookup key (see ``key``) to entity number.
        redirect_lookups -- old entity -> {"status", "entity"} redirects.
        issue_log -- optional issue log for missing associated entities.
        odp_collections -- collection prefixes for which a missing
            associated entity is reported as an issue.
        """
        if lookups is None:
            lookups = {}
        if redirect_lookups is None:
            redirect_lookups = {}
        if odp_collections is None:
            odp_collections = []
        self.lookups = lookups
        self.redirect_lookups = redirect_lookups
        self.issues = issue_log
        self.odp_collections = odp_collections
        # built lazily by _check_associated_organisation:
        # entity -> list of lookup keys resolving to it
        self._reverse_lookups = None

    def _lookup(self, prefix="", reference="", organisation=""):
        return (
            self.lookups.get(
                key(prefix=prefix, organisation=organisation, reference=reference), ""
            )
            or self.lookups.get(
                key(prefix=prefix, reference=reference), ""
            )
        )

    def _check_associated_organisation(self, entity):
        """
        Reject a fallback match when any of its lookup keys carries an
        organisation-style component (authority/development/government),
        since such matches are ambiguous across providers.
        """
        # Build the reverse index once and cache it: the previous version
        # rebuilt it on every call, making each processed row O(len(lookups)).
        # (Assumes self.lookups is not mutated after the first row.)
        if self._reverse_lookups is None:
            reverse_lookups = {}
            for k, v in self.lookups.items():
                reverse_lookups.setdefault(v, []).append(k)
            self._reverse_lookups = reverse_lookups

        keys_for_entity = self._reverse_lookups.get(entity)
        if keys_for_entity is not None:
            keywords = {"authority", "development", "government"}
            for k in keys_for_entity:
                parts = k.split(",")
                # parts[3] is the organisation component of the lookup key
                if len(parts) > 3 and any(kw in parts[3] for kw in keywords):
                    return ""
        return entity

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fill the "reference-entity" column for every row of *df*."""
        if df is None or df.height == 0:
            return df

        if "reference-entity" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference-entity"))

        ref_entities = []
        for row in df.iter_rows(named=True):
            prefix = row.get("prefix", "") or ""
            reference = row.get("reference", "") or ""
            entity_number = row.get("entity", "") or ""
            line_number = row.get("line-number", "") or row.get("__line_number", "")
            # legacy organisation ids are normalised to the current prefix
            organisation = (row.get("organisation", "") or "").replace(
                "local-authority-eng", "local-authority"
            )

            # rows without a full CURIE + entity keep their existing value
            if not (prefix and reference and entity_number):
                ref_entities.append(row.get("reference-entity", "") or "")
                continue

            find_entity = self._lookup(
                prefix=prefix, organisation=organisation, reference=reference
            )
            if not find_entity:
                # fall back to the unqualified key, then screen out
                # ambiguous organisation-associated matches
                find_entity = self._lookup(prefix=prefix, reference=reference)
                find_entity = self._check_associated_organisation(find_entity)

            if not find_entity or (
                str(find_entity) in self.redirect_lookups
                and int(self.redirect_lookups[str(find_entity)].get("status", 0)) == 410
            ):
                if self.odp_collections and prefix in self.odp_collections:
                    if self.issues:
                        self.issues.log_issue(
                            prefix,
                            "missing associated entity",
                            reference,
                            line_number=line_number,
                        )
                ref_entities.append("")
            else:
                ref_entities.append(str(find_entity))

        df = df.with_columns(pl.Series("reference-entity", ref_entities))
        return df
+
+
class PrintLookupPhase(PolarsPhase):
    """
    Collect candidate lookup entries for rows whose entity cannot be
    resolved, so they can be printed/added to the lookup table later.
    """

    def __init__(self, lookups=None, redirect_lookups=None):
        self.lookups = lookups if lookups is not None else {}
        self.redirect_lookups = (
            redirect_lookups if redirect_lookups is not None else {}
        )
        # each element is a single-item list holding one new lookup dict
        self.new_lookup_entries = []

    def _lookup(self, prefix="", reference="", organisation="", entry_number=""):
        """Resolve an entity, trying the most specific lookup key first."""
        by_entry = self.lookups.get(key(prefix=prefix, entry_number=entry_number), "")
        if by_entry:
            return by_entry
        by_organisation = self.lookups.get(
            key(prefix=prefix, organisation=organisation, reference=reference), ""
        )
        if by_organisation:
            return by_organisation
        return self.lookups.get(key(prefix=prefix, reference=reference), "")

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Record unresolved (prefix, organisation, reference) triples; *df* passes through."""
        if df is None or df.height == 0:
            return df

        for row in df.iter_rows(named=True):
            prefix = row.get("prefix", "") or ""
            organisation = row.get("organisation", "") or ""
            reference = row.get("reference", "") or ""
            entry_number = row.get("__entry_number", "")

            entity = ""
            if prefix:
                entity = self._lookup(
                    prefix=prefix,
                    reference=reference,
                    organisation=organisation,
                    entry_number=entry_number,
                )

            if entity:
                continue

            if prefix and organisation and reference:
                if "," in reference:
                    # quote references containing commas for CSV output
                    reference = f'"{reference}"'
                self.new_lookup_entries.append(
                    [
                        {
                            "prefix": prefix,
                            "organisation": organisation,
                            "reference": reference,
                        }
                    ]
                )
            elif not reference:
                logging.info(
                    "No reference found for entry: "
                    + str(entry_number)
                    + " in resource: "
                    + row.get("__resource", "")
                )

        return df
diff --git a/digital_land/phase_polars/map.py b/digital_land/phase_polars/map.py
new file mode 100644
index 00000000..0fb0cc0b
--- /dev/null
+++ b/digital_land/phase_polars/map.py
@@ -0,0 +1,93 @@
+import re
+
+import polars as pl
+
+from ..log import ColumnFieldLog
+from .phase import PolarsPhase
+
normalise_pattern = re.compile(r"[^a-z0-9-_]")


def normalise(name):
    """Lower-case *name*, map underscores to hyphens and drop other symbols."""
    return normalise_pattern.sub("", name.replace("_", "-").lower())
+
+
class MapPhase(PolarsPhase):
    """
    Rename columns according to the column map and specification fieldnames.
    """

    def __init__(self, fieldnames, columns=None, log=None):
        if columns is None:
            columns = {}
        self.columns = columns
        # Normalised spec fieldname -> canonical spec fieldname.
        self.normalised_fieldnames = {normalise(f): f for f in fieldnames}
        if not log:
            log = ColumnFieldLog()
        self.log = log

    def headers(self, column_names):
        """Build the header mapping (column_name -> field_name)."""
        headers = {}
        matched = []

        # First pass: explicit column-map entries take precedence.
        for header in sorted(column_names):
            fieldname = normalise(header)
            for pattern, value in self.columns.items():
                if fieldname == pattern:
                    matched.append(value)
                    headers[header] = value

        # Second pass: fall back to specification fieldnames for unmapped headers.
        for header in sorted(column_names):
            if header in headers:
                continue
            fieldname = normalise(header)
            if fieldname not in matched and fieldname in self.normalised_fieldnames:
                headers[header] = self.normalised_fieldnames[fieldname]

        # When both GeoX/Easting (or GeoY/Northing) are present, re-insert the
        # GeoX/GeoY entry last so it wins the keep-last dedup in process().
        if {"GeoX", "Easting"} <= headers.keys():
            item = headers.pop("GeoX")
            headers["GeoX"] = item

        if {"GeoY", "Northing"} <= headers.keys():
            item = headers.pop("GeoY")
            headers["GeoY"] = item

        return headers

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Rename/select mapped data columns, carrying __-prefixed metadata through."""
        if df is None or df.height == 0:
            return df

        data_cols = [c for c in df.columns if not c.startswith("__")]
        header_map = self.headers(data_cols)

        # Log the column -> field mapping.
        for col, field in header_map.items():
            self.log.add(column=col, field=field)

        # Select only mapped columns (drop unmapped data cols), keep metadata
        meta_cols = [c for c in df.columns if c.startswith("__")]

        select_exprs = []
        for col, field in header_map.items():
            if field == "IGNORE":
                continue
            select_exprs.append(pl.col(col).fill_null("").alias(field))

        # Add metadata columns
        for mc in meta_cols:
            select_exprs.append(pl.col(mc))

        # Handle duplicate target field names - if multiple columns map to the same
        # field, keep the last one (matching original generator behaviour).
        # (Fixed: the deduped list was being rebuilt inside the loop on every
        # iteration; build the dict once and materialise the values after.)
        seen = {}
        for expr in select_exprs:
            seen[expr.meta.output_name()] = expr

        return df.select(list(seen.values()))
diff --git a/digital_land/phase_polars/migrate.py b/digital_land/phase_polars/migrate.py
new file mode 100644
index 00000000..20ece215
--- /dev/null
+++ b/digital_land/phase_polars/migrate.py
@@ -0,0 +1,64 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class MigratePhase(PolarsPhase):
    """
    Rename fields to match the latest specification.

    ``migrations`` maps a target field to the source column it should be
    taken from when present.
    """

    def __init__(self, fields, migrations):
        self.migrations = migrations
        # Core identity fields are always carried through.
        self.fields = list(
            set(fields + ["entity", "organisation", "prefix", "reference"])
        )

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Select/rename columns per the migrations map; build point from GeoX/GeoY."""
        if df is None or df.height == 0:
            return df

        meta_cols = [c for c in df.columns if c.startswith("__")]

        exprs = []
        for field in self.fields:
            migrated_from = self.migrations.get(field)
            if migrated_from and migrated_from in df.columns:
                exprs.append(pl.col(migrated_from).alias(field))
            elif field in df.columns:
                exprs.append(pl.col(field))
            # else: field not present in df, skip

        # Handle GeoX/GeoY -> point conversion
        has_geoxy = "GeoX" in df.columns and "GeoY" in df.columns
        if has_geoxy and "point" in self.fields:
            exprs.append(
                pl.when(
                    pl.col("GeoX").is_not_null()
                    & (pl.col("GeoX") != "")
                    & pl.col("GeoY").is_not_null()
                    & (pl.col("GeoY") != "")
                )
                .then(
                    pl.concat_str(
                        [pl.lit("POINT("), pl.col("GeoX"), pl.lit(" "), pl.col("GeoY"), pl.lit(")")],
                        separator="",
                    )
                )
                .otherwise(pl.lit(""))
                .alias("point")
            )

        # Add metadata columns
        for mc in meta_cols:
            exprs.append(pl.col(mc))

        # Deduplicate by alias (keep last in case of conflict, e.g. point)
        seen = {}
        for expr in exprs:
            name = expr.meta.output_name()
            seen[name] = expr
        exprs = list(seen.values())

        return df.select(exprs)
diff --git a/digital_land/phase_polars/normalise.py b/digital_land/phase_polars/normalise.py
new file mode 100644
index 00000000..cd03f278
--- /dev/null
+++ b/digital_land/phase_polars/normalise.py
@@ -0,0 +1,84 @@
+import csv
+import os
+import re
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+patch_dir = os.path.join(os.path.dirname(__file__), "../patch")
+
+
class NormalisePhase(PolarsPhase):
    """
    Normalise CSV whitespace, strip null patterns and skip matching rows.

    In the streaming pipeline this operates on raw lines *before* parsing.
    In the Polars pipeline it operates on already-parsed string columns
    which gives equivalent results.
    """

    null_path = os.path.join(patch_dir, "null.csv")

    def __init__(self, skip_patterns=None):
        if skip_patterns is None:
            skip_patterns = []
        self.skip_patterns = [re.compile(p) for p in skip_patterns]

        self.null_patterns = []
        if os.path.exists(self.null_path):
            # Fixed: open the patch file in a `with` block so the handle is
            # closed (it was previously leaked to the garbage collector).
            with open(self.null_path, newline="") as null_file:
                for row in csv.DictReader(null_file):
                    self.null_patterns.append(re.compile(row["pattern"]))

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Strip whitespace, blank null-pattern values, drop blank/skipped rows."""
        if df is None or df.height == 0:
            return df

        # Identify data columns (non-metadata)
        data_cols = [c for c in df.columns if not c.startswith("__")]

        # Strip whitespace from all data columns
        strip_exprs = [
            pl.col(c)
            .str.strip_chars()
            .str.replace_all(r"\r", "")
            .str.replace_all(r"\n", "\r\n")
            .alias(c)
            for c in data_cols
        ]
        if strip_exprs:
            df = df.with_columns(strip_exprs)

        # Apply null patterns to all data columns
        for pattern in self.null_patterns:
            null_exprs = [
                pl.col(c).str.replace_all(pattern.pattern, "").alias(c)
                for c in data_cols
            ]
            if null_exprs:
                df = df.with_columns(null_exprs)

        # Remove completely blank rows (all data columns empty or null)
        if data_cols:
            not_blank = pl.lit(False)
            for c in data_cols:
                not_blank = not_blank | (
                    pl.col(c).is_not_null() & (pl.col(c) != "")
                )
            df = df.filter(not_blank)

        # Skip rows matching skip patterns (matched against full comma-joined line)
        if self.skip_patterns and data_cols:
            concat_expr = pl.concat_str(
                [pl.col(c).fill_null("") for c in data_cols], separator=","
            ).alias("__skip_line")
            df = df.with_columns(concat_expr)

            for pattern in self.skip_patterns:
                df = df.filter(
                    ~pl.col("__skip_line").str.contains(pattern.pattern)
                )

            df = df.drop("__skip_line")

        return df
diff --git a/digital_land/phase_polars/organisation.py b/digital_land/phase_polars/organisation.py
new file mode 100644
index 00000000..1c0f8cce
--- /dev/null
+++ b/digital_land/phase_polars/organisation.py
@@ -0,0 +1,52 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class OrganisationPhase(PolarsPhase):
    """
    Look up the organisation value.
    """

    def __init__(self, organisation=None, issues=None):
        # organisation: lookup object exposing .lookup(value) -> resolved id or "".
        # issues: issue-log sink with resource/line_number/entry_number attributes
        # and a log_issue(field, issue_type, value) method.
        self.organisation = organisation
        self.issues = issues

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Resolve the organisation column via the lookup, logging failures."""
        if df is None or df.height == 0:
            return df

        # Guarantee the column exists so downstream phases can rely on it.
        if "organisation" not in df.columns:
            df = df.with_columns(pl.lit("").alias("organisation"))

        if self.organisation is None:
            return df

        # Apply organisation lookup row-by-row (lookup may be complex)
        def _lookup(val):
            result = self.organisation.lookup(val if val else "")
            return result if result else ""

        df = df.with_columns(
            pl.col("organisation")
            .map_elements(_lookup, return_dtype=pl.Utf8)
            .alias("__org_resolved")
        )

        # Log issues for rows where organisation could not be resolved
        if self.issues:
            for row in df.filter(pl.col("__org_resolved") == "").iter_rows(named=True):
                org_val = row.get("organisation", "")
                # Only a non-empty value that failed to resolve is an issue;
                # blank organisations are left alone.
                if org_val:
                    # Point the issue log at this row's provenance first.
                    self.issues.resource = row.get("__resource", "")
                    self.issues.line_number = row.get("__line_number", 0)
                    self.issues.entry_number = row.get("__entry_number", 0)
                    self.issues.log_issue(
                        "organisation", "invalid organisation", org_val
                    )

        # Replace the original column with the resolved values.
        df = df.with_columns(
            pl.col("__org_resolved").alias("organisation")
        ).drop("__org_resolved")

        return df
diff --git a/digital_land/phase_polars/patch.py b/digital_land/phase_polars/patch.py
new file mode 100644
index 00000000..fa031e36
--- /dev/null
+++ b/digital_land/phase_polars/patch.py
@@ -0,0 +1,87 @@
+import re
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class PatchPhase(PolarsPhase):
    """
    Apply regex patches to field values.

    ``patches`` maps fieldname -> {pattern: replacement}; the "" key holds
    global patches applied to every field.
    """

    def __init__(self, issues=None, patches=None):
        if patches is None:
            patches = {}
        self.issues = issues
        self.patches = patches

    def _apply_patch_value(self, fieldname, value):
        """Apply patch to a single value - mirrors streaming logic exactly."""
        # Field-specific patches merged with global ("") patches.
        patches = {**self.patches.get(fieldname, {}), **self.patches.get("", {})}
        for pattern, replacement in patches.items():
            if pattern == value:
                # Exact-match pattern: anchor and escape to avoid regex surprises.
                pattern = f"^{re.escape(pattern)}$"
            match = re.match(pattern, value, flags=re.IGNORECASE)
            if match:
                newvalue = match.expand(replacement)
                if newvalue != value:
                    if self.issues:
                        self.issues.log_issue(fieldname, "patch", value)
                    return newvalue
        return value

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Patch each applicable column in place using map_elements."""
        if df is None or df.height == 0 or not self.patches:
            return df

        data_cols = [c for c in df.columns if not c.startswith("__")]

        # Determine which fields have patches
        patched_fields = set(self.patches.keys()) - {""}
        global_patches = self.patches.get("", {})

        fields_to_patch = set()
        for col in data_cols:
            if col in patched_fields or global_patches:
                fields_to_patch.add(col)

        if not fields_to_patch:
            return df

        # Use map_elements per field for correctness (regex expand logic is complex)
        for field in fields_to_patch:
            if field not in df.columns:
                continue

            field_patches = {
                **self.patches.get(field, {}),
                **self.patches.get("", {}),
            }
            if not field_patches:
                continue

            def make_patcher(fname, fpatch):
                def _patch(val):
                    if val is None or val == "":
                        return val
                    for pattern, replacement in fpatch.items():
                        p = pattern
                        if p == val:
                            p = f"^{re.escape(p)}$"
                        m = re.match(p, val, flags=re.IGNORECASE)
                        if m:
                            newval = m.expand(replacement)
                            # Fixed: match _apply_patch_value (streaming)
                            # behaviour - only accept the match when it
                            # actually changes the value, and log a "patch"
                            # issue when it does. Previously the inner
                            # patcher returned on any match and never
                            # recorded the issue.
                            if newval != val:
                                if self.issues:
                                    self.issues.log_issue(fname, "patch", val)
                                return newval
                    return val
                return _patch

            patcher = make_patcher(field, field_patches)
            df = df.with_columns(
                pl.col(field)
                .map_elements(patcher, return_dtype=pl.Utf8)
                .alias(field)
            )

        return df
diff --git a/digital_land/phase_polars/phase.py b/digital_land/phase_polars/phase.py
new file mode 100644
index 00000000..d9dbcbcb
--- /dev/null
+++ b/digital_land/phase_polars/phase.py
@@ -0,0 +1,14 @@
+import polars as pl
+
+
class PolarsPhase:
    """
    A step in a Polars-based pipeline process.

    Each phase takes a Polars DataFrame and returns a Polars DataFrame.
    Metadata columns (prefixed with __) carry through the pipeline:
    __resource, __line_number, __entry_number, __path, __dataset, __priority
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Identity transform; subclasses override to do real work."""
        return df
diff --git a/digital_land/phase_polars/pivot.py b/digital_land/phase_polars/pivot.py
new file mode 100644
index 00000000..4c6772c1
--- /dev/null
+++ b/digital_land/phase_polars/pivot.py
@@ -0,0 +1,70 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class PivotPhase(PolarsPhase):
    """
    Unpivot entity rows into a series of facts (one row per field value).
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Expand each entity row into one fact row per non-metadata field."""
        if df is None or df.height == 0:
            return df

        # (Fixed: removed an unused meta_cols local.)
        data_cols = [c for c in df.columns if not c.startswith("__") and c != "entity"]

        if "entity" not in df.columns:
            return df

        # We need to carry metadata and entity through the unpivot.
        # Polars .unpivot() works on value columns.
        # Build the result row-by-row for exact parity with the streaming version.
        rows = []
        for row in df.iter_rows(named=True):
            entity = row.get("entity", "")
            resource = row.get("__resource", "")
            line_number = row.get("__line_number", 0)
            entry_number = row.get("__entry_number", 0)
            priority = row.get("__priority", 1)
            entry_date = row.get("entry-date", "")

            for field in sorted(data_cols):
                value = row.get(field, "") or ""
                rows.append(
                    {
                        "fact": "",
                        "entity": entity,
                        "field": field,
                        "value": value,
                        "priority": str(priority),
                        "resource": resource,
                        "line-number": str(line_number),
                        "entry-number": str(entry_number),
                        "entry-date": entry_date,
                        "__resource": resource,
                        "__line_number": line_number,
                        "__entry_number": entry_number,
                    }
                )

        if not rows:
            # Preserve the fact schema even when there is nothing to emit.
            return pl.DataFrame(
                schema={
                    "fact": pl.Utf8,
                    "entity": pl.Utf8,
                    "field": pl.Utf8,
                    "value": pl.Utf8,
                    "priority": pl.Utf8,
                    "resource": pl.Utf8,
                    "line-number": pl.Utf8,
                    "entry-number": pl.Utf8,
                    "entry-date": pl.Utf8,
                    "__resource": pl.Utf8,
                    "__line_number": pl.Int64,
                    "__entry_number": pl.Int64,
                }
            )

        return pl.DataFrame(rows)
diff --git a/digital_land/phase_polars/prefix.py b/digital_land/phase_polars/prefix.py
new file mode 100644
index 00000000..978337be
--- /dev/null
+++ b/digital_land/phase_polars/prefix.py
@@ -0,0 +1,30 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class EntityPrefixPhase(PolarsPhase):
    """
    Ensure every entry has a prefix field.
    """

    def __init__(self, dataset=None):
        self.dataset = dataset

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fill a missing or blank prefix column with the dataset name."""
        if df is None or df.height == 0:
            return df

        if "prefix" in df.columns:
            blank = pl.col("prefix").is_null() | (pl.col("prefix") == "")
            filled = (
                pl.when(blank)
                .then(pl.lit(self.dataset))
                .otherwise(pl.col("prefix"))
                .alias("prefix")
            )
            return df.with_columns(filled)

        # No prefix column at all: create one from the dataset name.
        return df.with_columns(pl.lit(self.dataset).alias("prefix"))
diff --git a/digital_land/phase_polars/priority.py b/digital_land/phase_polars/priority.py
new file mode 100644
index 00000000..0a3f9af3
--- /dev/null
+++ b/digital_land/phase_polars/priority.py
@@ -0,0 +1,59 @@
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+from digital_land.configuration.main import Config
+
+
class PriorityPhase(PolarsPhase):
    """
    Deduce the priority of each entry when assembling facts.

    With a config, entries whose authoritative organisation is one of the
    providers get priority 2; otherwise the default priority (1) is used
    and the organisation is replaced with the authoritative one.
    """

    def __init__(self, config: Config = None, providers=None):
        if providers is None:
            providers = []
        self.providers = providers
        self.default_priority = 1
        self.config = config
        if not config:
            # Lazy %-style args: only formatted if the record is emitted.
            logging.warning(
                "No config provided so priority defaults to %s",
                self.default_priority,
            )

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Attach a __priority column (and possibly rewrite organisation)."""
        if df is None or df.height == 0:
            return df

        # Without an entity column there is nothing to look up.
        if "entity" not in df.columns:
            df = df.with_columns(pl.lit(self.default_priority).alias("__priority"))
            return df

        if self.config:
            priorities = []
            organisations = []
            for row in df.iter_rows(named=True):
                entity = row.get("entity", "")
                authoritative_org = self.config.get_entity_organisation(entity)
                if authoritative_org is not None:
                    if authoritative_org in self.providers:
                        # Provider-supplied data: lower priority (2), keep
                        # the row's own organisation.
                        priorities.append(2)
                        organisations.append(row.get("organisation", ""))
                    else:
                        # Authoritative data: default priority, use the
                        # authoritative organisation.
                        priorities.append(self.default_priority)
                        organisations.append(authoritative_org)
                else:
                    priorities.append(self.default_priority)
                    organisations.append(row.get("organisation", ""))

            df = df.with_columns(
                pl.Series("__priority", priorities),
                pl.Series("organisation", organisations),
            )
        else:
            df = df.with_columns(
                pl.lit(self.default_priority).alias("__priority")
            )

        return df
diff --git a/digital_land/phase_polars/prune.py b/digital_land/phase_polars/prune.py
new file mode 100644
index 00000000..6d962cbb
--- /dev/null
+++ b/digital_land/phase_polars/prune.py
@@ -0,0 +1,86 @@
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+logger = logging.getLogger(__name__)
+
+
class FieldPrunePhase(PolarsPhase):
    """
    Reduce columns to only those specified for the dataset.
    """

    def __init__(self, fields):
        # Core identity fields are always retained.
        self.fields = list(
            set(fields + ["entity", "organisation", "prefix", "reference"])
        )
        # Fixed: use the module logger with lazy %-args instead of the root
        # logger with an f-string.
        logger.debug("pruning fields to %s", self.fields)

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Keep only configured fields plus __-prefixed metadata columns."""
        if df is None or df.height == 0:
            return df

        meta_cols = [c for c in df.columns if c.startswith("__")]
        keep = [c for c in self.fields if c in df.columns] + meta_cols
        return df.select(keep)
+
+
class EntityPrunePhase(PolarsPhase):
    """
    Remove entries with a missing entity value.
    """

    def __init__(self, issue_log=None, dataset_resource_log=None):
        # NOTE(review): issue_log is accepted for interface parity with the
        # streaming phase but is currently unused here - confirm intentional.
        self.log = dataset_resource_log

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Drop rows with a null/empty entity; record the surviving row count."""
        if df is None or df.height == 0:
            if self.log:
                self.log.entry_count = 0
            return df

        if "entity" not in df.columns:
            if self.log:
                self.log.entry_count = 0
            return df

        # Log skipped rows
        missing = df.filter(
            pl.col("entity").is_null() | (pl.col("entity") == "")
        )
        for row in missing.iter_rows(named=True):
            resource = row.get("__resource", "")
            prefix = row.get("prefix", "")
            reference = row.get("reference", "")
            curie = f"{prefix}:{reference}"
            entry_number = row.get("__entry_number", "")
            logger.info(f"{resource} row {entry_number}: missing entity for {curie}")

        # Keep only rows with a non-empty entity.
        result = df.filter(
            pl.col("entity").is_not_null() & (pl.col("entity") != "")
        )

        if self.log:
            self.log.entry_count = result.height

        return result
+
+
class FactPrunePhase(PolarsPhase):
    """
    Remove facts with a missing value (except when field is end-date).
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Drop rows whose value is null/empty, keeping end-date facts."""
        if df is None or df.height == 0:
            return df

        if "value" not in df.columns:
            return df

        has_value = pl.col("value").is_not_null() & (pl.col("value") != "")
        is_end_date = pl.col("field") == "end-date"
        return df.filter(has_value | is_end_date)
diff --git a/digital_land/phase_polars/reference.py b/digital_land/phase_polars/reference.py
new file mode 100644
index 00000000..6e729d8b
--- /dev/null
+++ b/digital_land/phase_polars/reference.py
@@ -0,0 +1,133 @@
+import re
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+logger = logging.getLogger(__name__)
+
# Fixed: the named groups had lost their names ("(?P[..." is invalid regex
# syntax and would raise re.error at import) - restore (?P<prefix>...) and
# (?P<reference>...) which split_curie reads by name below.
curie_re = re.compile(r"(?P<prefix>[A-Za-z0-9_-]+):(?P<reference>[A-Za-z0-9_-].*)$")


def split_curie(value):
    """Split a CURIE-style value into (prefix, reference).

    Returns ("", value) when the value has no recognisable prefix.
    """
    match = curie_re.match(value)
    if not match:
        return ("", value)
    return (match.group("prefix"), match.group("reference"))
+
+
class EntityReferencePhase(PolarsPhase):
    """
    Ensure each entry has prefix and reference fields derived from the reference column.
    """

    def __init__(self, dataset=None, prefix=None, issues=None):
        self.dataset = dataset
        # Default prefix falls back to the dataset name.
        self.prefix = prefix or dataset
        self.issues = issues

    def _process_row(self, row_dict):
        """Return (prefix, reference) for one row, splitting CURIE values."""
        # Falls back to a column named after the dataset when reference is empty.
        reference_value = row_dict.get("reference", "") or row_dict.get(self.dataset, "") or ""
        ref_prefix, reference = split_curie(reference_value)

        if self.issues and ref_prefix:
            # Point the issue log at this row's provenance before logging.
            self.issues.resource = row_dict.get("__resource", "")
            self.issues.line_number = row_dict.get("__line_number", 0)
            self.issues.entry_number = row_dict.get("__entry_number", 0)
            self.issues.log_issue(
                "reference",
                "reference value contains reference_prefix",
                ref_prefix,
                f"Original reference split into prefix '{ref_prefix}' and reference '{reference}'",
            )

        # Discard UPRN-style prefixes so they don't override the dataset prefix.
        if "UPRN" in ref_prefix:
            ref_prefix = ""

        # Precedence: existing prefix column, then prefix from the CURIE,
        # then the phase default.
        prefix = row_dict.get("prefix", "") or ref_prefix or self.prefix
        return prefix, reference

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Populate/normalise the prefix and reference columns row by row."""
        if df is None or df.height == 0:
            return df

        # Guarantee both columns exist so downstream phases can rely on them.
        if "prefix" not in df.columns:
            df = df.with_columns(pl.lit("").alias("prefix"))
        if "reference" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference"))

        prefixes = []
        references = []
        for row in df.iter_rows(named=True):
            p, r = self._process_row(row)
            prefixes.append(p)
            references.append(r)

        df = df.with_columns(
            pl.Series("prefix", prefixes),
            pl.Series("reference", references),
        )

        return df
+
+
class FactReferencePhase(PolarsPhase):
    """
    Ensure a fact which is a reference has prefix and reference fields.
    """

    def __init__(
        self,
        field_typology_map=None,
        field_prefix_map=None,
    ):
        self.field_typology_map = field_typology_map or {}
        self.field_prefix_map = field_prefix_map or {}

    def _resolve(self, row, ref_typologies):
        """Return (prefix, reference) for one fact row."""
        prefix = row.get("prefix", "") or ""
        reference = row.get("reference", "") or ""

        # Already populated: nothing to derive.
        if prefix and reference:
            return prefix, reference

        field = row.get("field", "")
        if self.field_typology_map.get(field, "") in ref_typologies:
            # Reference-typed fields may carry a CURIE in their value.
            value_prefix, value_reference = split_curie(row.get("value", "") or "")
            prefix = prefix or value_prefix or self.field_prefix_map.get(field, field)
            reference = reference or value_reference

        return prefix, reference

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fill prefix/reference for reference-typed facts; df otherwise unchanged."""
        if df is None or df.height == 0:
            return df

        if "prefix" not in df.columns:
            df = df.with_columns(pl.lit("").alias("prefix"))
        if "reference" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference"))
        if "field" not in df.columns or "value" not in df.columns:
            return df

        ref_typologies = {
            "category", "document", "geography",
            "organisation", "policy", "legal-instrument",
        }

        prefixes, references = [], []
        for row in df.iter_rows(named=True):
            resolved_prefix, resolved_reference = self._resolve(row, ref_typologies)
            prefixes.append(resolved_prefix)
            references.append(resolved_reference)

        return df.with_columns(
            pl.Series("prefix", prefixes),
            pl.Series("reference", references),
        )
diff --git a/digital_land/phase_polars/save.py b/digital_land/phase_polars/save.py
new file mode 100644
index 00000000..f6322c92
--- /dev/null
+++ b/digital_land/phase_polars/save.py
@@ -0,0 +1,45 @@
+import csv
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class SavePhase(PolarsPhase):
    """
    Save the DataFrame to a CSV file, then pass through.
    """

    def __init__(self, path=None, f=None, fieldnames=None, enabled=True):
        self.path = path
        self.f = f
        self.fieldnames = fieldnames
        self.enabled = enabled

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Write data columns as CSV to the file object or path; return df unchanged."""
        if not self.enabled or df is None or df.height == 0:
            return df

        # Metadata (__-prefixed) columns are never written out.
        data_cols = [c for c in df.columns if not c.startswith("__")]

        if self.fieldnames:
            # Only keep requested fieldnames that exist.
            keep = sorted(f for f in self.fieldnames if f in data_cols)
        else:
            keep = sorted(data_cols)

        if not keep:
            return df

        selected = df.select(keep)

        if self.f:
            # A file object takes precedence over a path.
            self.f.write(selected.write_csv())
        elif self.path:
            selected.write_csv(str(self.path))

        return df
diff --git a/digital_land/phase_polars/transform/__init__.py b/digital_land/phase_polars/transform/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/concat_field.py b/digital_land/phase_polars/transform/concat_field.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/convert.py b/digital_land/phase_polars/transform/convert.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/entity_lookup.py b/digital_land/phase_polars/transform/entity_lookup.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/entity_reference.py b/digital_land/phase_polars/transform/entity_reference.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/fact_hash.py b/digital_land/phase_polars/transform/fact_hash.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/field_prune.py b/digital_land/phase_polars/transform/field_prune.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/filter.py b/digital_land/phase_polars/transform/filter.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/flatten.py b/digital_land/phase_polars/transform/flatten.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/map.py b/digital_land/phase_polars/transform/map.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/migrate.py b/digital_land/phase_polars/transform/migrate.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/normalise.py b/digital_land/phase_polars/transform/normalise.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/parse.py b/digital_land/phase_polars/transform/parse.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/patch.py b/digital_land/phase_polars/transform/patch.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/pivot.py b/digital_land/phase_polars/transform/pivot.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/priority.py b/digital_land/phase_polars/transform/priority.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/resolve_organisation.py b/digital_land/phase_polars/transform/resolve_organisation.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/set_default.py b/digital_land/phase_polars/transform/set_default.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/validate.py b/digital_land/phase_polars/transform/validate.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/pyproject.toml b/pyproject.toml
index 19d458a4..dfeea6b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,7 @@ dependencies = [
"boto3",
"moto",
"psutil",
+ "polars",
]
classifiers = [
diff --git a/test_polars_phases.py b/test_polars_phases.py
new file mode 100644
index 00000000..f5ff0bed
--- /dev/null
+++ b/test_polars_phases.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+"""
+Test script for Polars-based pipeline phases.
+
+Creates a simple CSV, runs each polars phase individually and in chain,
+and verifies the output matches expectations.
+"""
+
+import os
+import sys
+import tempfile
+import logging
+
+logging.basicConfig(level=logging.DEBUG, format="%(name)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+# ── Create test CSV ──────────────────────────────────────────────────────────
+TEST_CSV_CONTENT = """\
+reference,name,geometry,documentation-url,start-date,organisation,entry-date
+ref-001,Test Area One,MULTIPOLYGON(((-0.1 51.5,-0.1 51.6,-0.2 51.6,-0.2 51.5,-0.1 51.5))),https://example.com/doc1,2024-01-15,local-authority-eng:example,2024-01-15
+ref-002,Test Area Two,MULTIPOLYGON(((-0.3 51.5,-0.3 51.6,-0.4 51.6,-0.4 51.5,-0.3 51.5))),https://example.com/doc2,2024-02-20,local-authority-eng:example,2024-02-20
+ref-003," Test Area Three ",MULTIPOLYGON(((-0.5 51.5,-0.5 51.6,-0.6 51.6,-0.6 51.5,-0.5 51.5))),https://example.com/doc3,2024-03-10,local-authority-eng:example,2024-03-10
+"""
+
+tmp_dir = tempfile.mkdtemp(prefix="polars_phases_test_")
+input_csv = os.path.join(tmp_dir, "test_input.csv")
+output_csv = os.path.join(tmp_dir, "test_output.csv")
+
+with open(input_csv, "w") as f:
+ f.write(TEST_CSV_CONTENT)
+
+print(f"Test data written to: {input_csv}")
+print(f"Output will go to: {output_csv}")
+
+# ── Import polars phases ─────────────────────────────────────────────────────
+import polars as pl
+
+from digital_land.phase_polars import (
+ run_polars_pipeline,
+ ConvertPhase,
+ NormalisePhase,
+ ConcatFieldPhase,
+ FilterPhase,
+ MapPhase,
+ PatchPhase,
+ HarmonisePhase,
+ DefaultPhase,
+ MigratePhase,
+ OrganisationPhase,
+ FieldPrunePhase,
+ EntityPrunePhase,
+ FactPrunePhase,
+ EntityReferencePhase,
+ EntityPrefixPhase,
+ EntityLookupPhase,
+ FactLookupPhase,
+ SavePhase,
+ PivotPhase,
+ FactCombinePhase,
+ FactorPhase,
+ PriorityPhase,
+ DumpPhase,
+ LoadPhase,
+)
+from digital_land.log import DatasetResourceLog, ConvertedResourceLog, ColumnFieldLog, IssueLog
+
+passed = 0
+failed = 0
+
+
+def check(name, condition, detail=""):
+ global passed, failed
+ if condition:
+ print(f" PASS: {name}")
+ passed += 1
+ else:
+ print(f" FAIL: {name} {detail}")
+ failed += 1
+
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 1: ConvertPhase — loads CSV into DataFrame
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 1: ConvertPhase ──")
+dataset_resource_log = DatasetResourceLog()
+converted_resource_log = ConvertedResourceLog()
+convert_phase = ConvertPhase(
+ path=input_csv,
+ dataset_resource_log=dataset_resource_log,
+ converted_resource_log=converted_resource_log,
+)
+df = convert_phase.process()
+check("returns DataFrame", isinstance(df, pl.DataFrame))
+check("has 3 rows", df.height == 3, f"got {df.height}")
+check("has __resource column", "__resource" in df.columns)
+check("has __line_number column", "__line_number" in df.columns)
+check("has reference column", "reference" in df.columns)
+print(f" Columns: {[c for c in df.columns if not c.startswith('__')]}")
+print(f" Shape: {df.shape}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 2: NormalisePhase — strips whitespace
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 2: NormalisePhase ──")
+normalise_phase = NormalisePhase(skip_patterns=[])
+df2 = normalise_phase.process(df)
+check("preserves row count", df2.height == 3)
+# Check that whitespace was stripped from " Test Area Three "
+names = df2["name"].to_list()
+check("whitespace stripped", "Test Area Three" in names, f"got {names}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 3: MapPhase — renames columns
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 3: MapPhase ──")
+fieldnames = [
+ "reference", "name", "geometry", "documentation-url",
+ "start-date", "organisation", "entry-date", "point",
+ "entity", "prefix", "end-date",
+]
+column_field_log = ColumnFieldLog()
+map_phase = MapPhase(fieldnames=fieldnames, columns={}, log=column_field_log)
+df3 = map_phase.process(df2)
+check("preserves row count", df3.height == 3)
+check("has reference column", "reference" in df3.columns)
+data_cols = [c for c in df3.columns if not c.startswith("__")]
+print(f" Mapped columns: {data_cols}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 4: PatchPhase — applies patches
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 4: PatchPhase ──")
+issue_log = IssueLog(dataset="test-dataset", resource="test-resource")
+patch_phase = PatchPhase(issues=issue_log, patches={})
+df4 = patch_phase.process(df3)
+check("no patches, same rows", df4.height == 3)
+
+# Test with actual patches
+patch_with_data = PatchPhase(
+ issues=issue_log,
+ patches={"name": {"Test Area One": "Patched Area One"}},
+)
+df4b = patch_with_data.process(df3)
+names_patched = df4b["name"].to_list()
+check("patch applied", "Patched Area One" in names_patched, f"got {names_patched}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 5: DefaultPhase — applies defaults
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 5: DefaultPhase ──")
+default_phase = DefaultPhase(
+ issues=issue_log,
+ default_values={"end-date": ""},
+)
+# Add an empty end-date column
+df5_in = df4.with_columns(pl.lit("").alias("end-date"))
+df5 = default_phase.process(df5_in)
+check("preserves rows", df5.height == 3)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 6: FilterPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 6: FilterPhase ──")
+filter_phase = FilterPhase(filters={})
+df6 = filter_phase.process(df5)
+check("no filter, same rows", df6.height == 3)
+
+filter_with_data = FilterPhase(filters={"reference": "ref-001"})
+df6b = filter_with_data.process(df5)
+check("filter applied", df6b.height == 1, f"got {df6b.height}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 7: MigratePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 7: MigratePhase ──")
+migrate_phase = MigratePhase(
+ fields=["reference", "name", "geometry", "documentation-url",
+ "start-date", "organisation", "entry-date", "end-date"],
+ migrations={},
+)
+df7 = migrate_phase.process(df6)
+check("preserves rows", df7.height == 3)
+check("has reference", "reference" in df7.columns)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 8: EntityReferencePhase + EntityPrefixPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 8: EntityReferencePhase + EntityPrefixPhase ──")
+ref_phase = EntityReferencePhase(dataset="test-dataset", prefix="test-dataset", issues=issue_log)
+df8 = ref_phase.process(df7)
+check("has prefix", "prefix" in df8.columns)
+check("has reference", "reference" in df8.columns)
+prefixes = df8["prefix"].to_list()
+check("prefix set", all(p == "test-dataset" for p in prefixes), f"got {prefixes}")
+
+prefix_phase = EntityPrefixPhase(dataset="test-dataset")
+df8b = prefix_phase.process(df8)
+check("prefix still set", all(p == "test-dataset" for p in df8b["prefix"].to_list()))
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 9: FieldPrunePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 9: FieldPrunePhase ──")
+prune_phase = FieldPrunePhase(fields=["reference", "name", "geometry", "organisation"])
+df9 = prune_phase.process(df8b)
+data_cols9 = [c for c in df9.columns if not c.startswith("__")]
+check("pruned to expected fields", len(data_cols9) <= 8, f"got {data_cols9}")
+check("has reference", "reference" in df9.columns)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 10: EntityLookupPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 10: EntityLookupPhase ──")
+from digital_land.phase_polars.lookup import key as lookup_key
+lookups = {
+ lookup_key(prefix="test-dataset", reference="ref-001"): "1000001",
+ lookup_key(prefix="test-dataset", reference="ref-002"): "1000002",
+ lookup_key(prefix="test-dataset", reference="ref-003"): "1000003",
+}
+lookup_phase = EntityLookupPhase(
+ lookups=lookups,
+ redirect_lookups={},
+ issue_log=issue_log,
+ entity_range=[1000000, 2000000],
+)
+df10 = lookup_phase.process(df9)
+check("has entity column", "entity" in df10.columns)
+entities = df10["entity"].to_list()
+check("entities assigned", "1000001" in entities, f"got {entities}")
+check("all entities assigned", all(e for e in entities), f"got {entities}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 11: EntityPrunePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 11: EntityPrunePhase ──")
+dataset_resource_log2 = DatasetResourceLog(dataset="test-dataset", resource="test-resource")
+entity_prune = EntityPrunePhase(dataset_resource_log=dataset_resource_log2)
+df11 = entity_prune.process(df10)
+check("all rows kept (all have entities)", df11.height == 3)
+check("entry count logged", dataset_resource_log2.entry_count == 3)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 12: PriorityPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 12: PriorityPhase ──")
+priority_phase = PriorityPhase(config=None, providers=[])
+df12 = priority_phase.process(df11)
+check("has __priority", "__priority" in df12.columns)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 13: PivotPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 13: PivotPhase ──")
+pivot_phase = PivotPhase()
+df13 = pivot_phase.process(df12)
+check("pivoted to facts", df13.height > 3, f"got {df13.height} rows (should be > 3)")
+check("has fact column", "fact" in df13.columns)
+check("has field column", "field" in df13.columns)
+check("has value column", "value" in df13.columns)
+print(f" Pivoted to {df13.height} fact rows from 3 entity rows")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 14: FactorPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 14: FactorPhase ──")
+factor_phase = FactorPhase()
+df14 = factor_phase.process(df13)
+facts = df14["fact"].to_list()
+non_empty_facts = [f for f in facts if f]
+check("fact hashes generated", len(non_empty_facts) > 0, f"got {len(non_empty_facts)}")
+check("fact is sha256 hex", len(non_empty_facts[0]) == 64 if non_empty_facts else False)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 15: FactPrunePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 15: FactPrunePhase ──")
+fact_prune = FactPrunePhase()
+df15 = fact_prune.process(df14)
+check("facts pruned (empty values removed)", df15.height <= df14.height)
+print(f"  Before: {df14.height} → After: {df15.height}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 16: SavePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 16: SavePhase ──")
+save_phase = SavePhase(path=output_csv, fieldnames=["entity", "fact", "field", "value"])
+df16 = save_phase.process(df15)
+check("CSV file created", os.path.exists(output_csv))
+if os.path.exists(output_csv):
+ import csv
+ with open(output_csv) as f:
+ reader = csv.DictReader(f)
+ rows = list(reader)
+ check("CSV has rows", len(rows) > 0, f"got {len(rows)}")
+ check("CSV has entity column", "entity" in rows[0])
+ print(f" Saved {len(rows)} rows to {output_csv}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 17: run_polars_pipeline (chained execution)
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 17: run_polars_pipeline (chained) ──")
+chain_output = os.path.join(tmp_dir, "chain_output.csv")
+result_df = run_polars_pipeline(
+ ConvertPhase(path=input_csv),
+ NormalisePhase(),
+ MapPhase(fieldnames=fieldnames, columns={}),
+ FilterPhase(filters={}),
+ SavePhase(path=chain_output, enabled=True),
+)
+check("chain returns DataFrame", isinstance(result_df, pl.DataFrame))
+check("chain output file exists", os.path.exists(chain_output))
+if os.path.exists(chain_output):
+ result_check = pl.read_csv(chain_output)
+ check("chain output has 3 rows", result_check.height == 3, f"got {result_check.height}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# SUMMARY
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n" + "=" * 70)
+print(f"RESULTS: {passed} passed, {failed} failed out of {passed + failed} checks")
+print("=" * 70)
+
+if failed > 0:
+ print("\nSome tests FAILED!")
+ sys.exit(1)
+else:
+ print("\nAll tests PASSED!")
+ sys.exit(0)