From f248b9dc61a0739e0a506f27fbb0d93762ed0e64 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 13:45:04 +0000
Subject: [PATCH 01/15] =?UTF-8?q?fix:=20add=20venv=20and=20local=5Ftesting?=
=?UTF-8?q?=20to=20.gitignore=20Rapid=20local=20performance=20test=20envir?=
=?UTF-8?q?onment=20supporting=20the=20Polars=E2=80=91based=20transformati?=
=?UTF-8?q?on=20rewrite=20in=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 2 ++
1 file changed, 2 insertions(+)
diff --git a/.gitignore b/.gitignore
index a08611ff..e631b4d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,8 @@ demodata/
.eggs
*.gfs
.venv
+/venv
+/local_testing
.direnv
var/cache
/collection
From e58d438d0acf552472e3e1d2e299db39269a9764 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:11:32 +0000
Subject: [PATCH 02/15] =?UTF-8?q?feat:=20add=20command-line=20interface=20?=
=?UTF-8?q?for=20title-boundary=20pipeline=20with=20argument=20parsing=20R?=
=?UTF-8?q?apid=20local=20performance=20test=20environment=20supporting=20?=
=?UTF-8?q?the=20Polars=E2=80=91based=20transformation=20rewrite=20in=20di?=
=?UTF-8?q?gital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 15 ++++-
local_testing/cli.py | 143 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 156 insertions(+), 2 deletions(-)
create mode 100644 local_testing/cli.py
diff --git a/.gitignore b/.gitignore
index e631b4d7..0d82846f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,6 @@ demodata/
*.gfs
.venv
/venv
-/local_testing
.direnv
var/cache
/collection
@@ -37,4 +36,16 @@ docs/modules.rst
# don't store data folder for use as storage for notebooks
notebooks/data/
-notebooks/.ipynb_checkpoints
\ No newline at end of file
+notebooks/.ipynb_checkpoints
+
+# local_testing
+/local_testing/cache/
+/local_testing/converted/
+/local_testing/extracted/
+/local_testing/output/
+/local_testing/pipeline/
+/local_testing/polars_phases/
+/local_testing/raw/
+/local_testing/reports/
+/local_testing/specification/
+/local_testing/venv/
diff --git a/local_testing/cli.py b/local_testing/cli.py
new file mode 100644
index 00000000..4ab74687
--- /dev/null
+++ b/local_testing/cli.py
@@ -0,0 +1,143 @@
+"""
+Command-line interface for title-boundary pipeline.
+
+Handles argument parsing and provides user-facing CLI functions.
+"""
+
+import argparse
+from typing import List, Dict
+
+from file_downloader import FileDownloader
+
+
+class CLI:
+ """Command-line interface manager."""
+
+ ENDPOINT_CSV_URL = "https://raw.githubusercontent.com/digital-land/config/main/collection/title-boundary/endpoint.csv"
+
+ @staticmethod
+ def create_parser() -> argparse.ArgumentParser:
+ """
+ Create argument parser for CLI.
+
+ Returns:
+ Configured ArgumentParser instance
+ """
+ parser = argparse.ArgumentParser(
+ description="Title Boundary Pipeline - Download, Convert, and Transform",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ python main.py # List available LAs
+ python main.py --la "Buckinghamshire" # Process Buckinghamshire
+ python main.py --la "Buckinghamshire" --limit 100 # Limit to 100 records
+ python main.py --use-duckdb --use-parquet # Best performance
+ """,
+ )
+
+ parser.add_argument(
+ "--la", type=str, help="Local Authority name (partial match)"
+ )
+ parser.add_argument(
+ "--limit", type=int, help="Limit number of records to process"
+ )
+ parser.add_argument(
+ "--skip-download",
+ action="store_true",
+ help="Skip download, use existing data",
+ )
+ parser.add_argument(
+ "--list", action="store_true", help="List available Local Authorities"
+ )
+ parser.add_argument(
+ "--use-duckdb",
+ action="store_true",
+ help="Use DuckDB for GML conversion (faster)",
+ )
+ parser.add_argument(
+ "--use-parquet", action="store_true", help="Output Parquet instead of CSV"
+ )
+ parser.add_argument(
+ "--compare",
+ action="store_true",
+ help="Run both original and Polars pipelines for performance comparison",
+ )
+
+ return parser
+
+ @classmethod
+ def fetch_endpoint_list(cls) -> List[Dict]:
+ """
+ Fetch list of available endpoints from Land Registry API.
+
+ Returns:
+ List of endpoint dictionaries
+ """
+ return FileDownloader().fetch_endpoint_list()
+
+ @staticmethod
+ def get_la_name_from_url(url: str) -> str:
+ """
+ Extract Local Authority name from endpoint URL.
+
+ Args:
+ url: Endpoint URL
+
+ Returns:
+ Formatted LA name
+ """
+ return FileDownloader.get_la_name_from_url(url)
+
+ @classmethod
+ def list_available_las(cls):
+ """List all available Local Authorities to console."""
+ endpoints = cls.fetch_endpoint_list()
+
+ print(f"\n{'='*60}")
+ print("Available Local Authorities")
+ print(f"{'='*60}\n")
+
+ for i, ep in enumerate(endpoints, 1):
+ name = ep.get("local_authority", "Unknown")
+ print(f" {i:3d}. {name}")
+
+ print(f"\n{'='*60}")
+ print(f"Total: {len(endpoints)} Local Authorities")
+ print(f"{'='*60}\n")
+
+ return endpoints
+
+ @classmethod
+ def find_matching_la(cls, search_term: str) -> tuple:
+ """
+ Find Local Authority matching search term.
+
+ Args:
+ search_term: Partial LA name to search for
+
+ Returns:
+ Tuple of (matching_endpoint, la_name) or (None, None) if no match/multiple matches
+ """
+ endpoints = cls.fetch_endpoint_list()
+ matching = [
+ ep
+ for ep in endpoints
+ if search_term.lower() in ep.get("local_authority", "").lower()
+ ]
+
+ if not matching:
+ print(f"Error: No Local Authority matching '{search_term}'")
+ print("Use --list to see available options")
+ return None, None
+
+ if len(matching) > 1:
+ print(f"Multiple matches for '{search_term}':")
+ for ep in matching:
+ print(f" - {ep.get('local_authority', 'Unknown')}")
+ print("Please be more specific")
+ return None, None
+
+ endpoint = matching[0]
+ la_name = endpoint.get("local_authority", "Unknown")
+
+ return endpoint, la_name
From c3f82f4bac7fdf82a61ad49228fad268476b304b Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:11:51 +0000
Subject: [PATCH 03/15] =?UTF-8?q?feat:=20implement=20file=20downloader=20f?=
=?UTF-8?q?or=20title-boundary=20GML=20files=20with=20progress=20tracking?=
=?UTF-8?q?=20Rapid=20local=20performance=20test=20environment=20supportin?=
=?UTF-8?q?g=20the=20Polars=E2=80=91based=20transformation=20rewrite=20in?=
=?UTF-8?q?=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/file_downloader.py | 177 +++++++++++++++++++++++++++++++
1 file changed, 177 insertions(+)
create mode 100644 local_testing/file_downloader.py
diff --git a/local_testing/file_downloader.py b/local_testing/file_downloader.py
new file mode 100644
index 00000000..e13fc40b
--- /dev/null
+++ b/local_testing/file_downloader.py
@@ -0,0 +1,177 @@
+"""
+File downloader for title-boundary GML files.
+
+Handles fetching endpoint lists from GitHub config repository
+and downloading ZIP files with progress tracking.
+"""
+
+import csv
+import urllib.request
+from pathlib import Path
+from typing import List, Optional
+
+try:
+ import requests
+ HAS_REQUESTS = True
+except ImportError:
+ HAS_REQUESTS = False
+
+
+class FileDownloader:
+ """Handles downloading title-boundary files from endpoint CSV."""
+
+ ENDPOINT_CSV_URL = "https://raw.githubusercontent.com/digital-land/config/main/collection/title-boundary/endpoint.csv"
+
+ def __init__(self, endpoint_csv_url: Optional[str] = None):
+ """Initialize downloader with optional custom endpoint CSV URL."""
+ self.endpoint_csv_url = endpoint_csv_url or self.ENDPOINT_CSV_URL
+
+ def fetch_endpoint_list(self) -> List[dict]:
+ """Fetch list of available title boundary datasets from GitHub CSV."""
+ print(f" Fetching endpoint list from {self.endpoint_csv_url}...")
+
+ req = urllib.request.Request(
+ self.endpoint_csv_url,
+ headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
+ )
+
+ with urllib.request.urlopen(req) as response:
+ content = response.read().decode('utf-8')
+ reader = csv.DictReader(content.splitlines())
+
+ endpoints = []
+ for row in reader:
+ url = row.get('endpoint-url', '').strip()
+ if url:
+ endpoints.append({
+ 'endpoint': row.get('endpoint', ''),
+ 'url': url,
+ 'local_authority': self.get_la_name_from_url(url),
+ 'entry_date': row.get('entry-date', ''),
+ })
+
+ print(f" Found {len(endpoints)} endpoints")
+ return endpoints
+
+ @staticmethod
+ def get_la_name_from_url(url: str) -> str:
+ """Extract Local Authority name from download URL."""
+ # URL format: .../download/Buckinghamshire_Council.zip
+ parts = url.split("/")
+ if parts:
+ filename = parts[-1].replace(".zip", "").replace("_", " ")
+ # Remove common suffixes for cleaner names
+ for suffix in [" Council", " Borough Council", " City Council", " District Council",
+ " Metropolitan Borough Council", " County Council"]:
+ if filename.endswith(suffix):
+ filename = filename[:-len(suffix)]
+ break
+ # Remove prefixes
+ for prefix in ["Borough of ", "City of ", "County of ", "Royal Borough of ",
+ "London Borough of "]:
+ if filename.startswith(prefix):
+ filename = filename[len(prefix):]
+ break
+ return filename.strip()
+ return "Unknown"
+
+ def download_file(
+ self, url: str, output_path: Path, chunk_size: int = 8192
+ ) -> Path:
+ """
+ Download file from URL to output path with progress tracking.
+
+ Args:
+ url: URL to download from
+ output_path: Path where file should be saved
+ chunk_size: Size of download chunks in bytes
+
+ Returns:
+ Path to downloaded file
+ """
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+
+ print(f" Downloading from {url}")
+ print(f" Output: {output_path}")
+
+ # Use requests library if available (better redirect/cookie handling)
+ if HAS_REQUESTS:
+ return self._download_with_requests(url, output_path, chunk_size)
+ else:
+ return self._download_with_urllib(url, output_path, chunk_size)
+
+ def _download_with_requests(self, url: str, output_path: Path, chunk_size: int) -> Path:
+ """Download using requests library (handles redirects better)."""
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'Accept-Language': 'en-GB,en;q=0.9',
+ }
+
+ session = requests.Session()
+ session.headers.update(headers)
+
+ response = session.get(url, stream=True, allow_redirects=True, timeout=30)
+ response.raise_for_status()
+
+ total_size = int(response.headers.get('content-length', 0))
+ downloaded = 0
+
+ with open(output_path, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=chunk_size):
+ if chunk:
+ f.write(chunk)
+ downloaded += len(chunk)
+
+ if total_size > 0:
+ progress = (downloaded / total_size) * 100
+ mb_downloaded = downloaded / (1024 * 1024)
+ mb_total = total_size / (1024 * 1024)
+ print(f"\r Progress: {progress:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end="", flush=True)
+
+ print() # New line after progress
+        print(f" ✓ Downloaded {downloaded:,} bytes")
+ return output_path
+
+ def _download_with_urllib(self, url: str, output_path: Path, chunk_size: int) -> Path:
+ """Download using urllib (fallback)."""
+
+ # Add comprehensive browser headers to mimic real browser
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+ 'Accept-Language': 'en-GB,en;q=0.9',
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Connection': 'keep-alive',
+ 'Upgrade-Insecure-Requests': '1',
+ 'Sec-Fetch-Dest': 'document',
+ 'Sec-Fetch-Mode': 'navigate',
+ 'Sec-Fetch-Site': 'none',
+ }
+
+ req = urllib.request.Request(url, headers=headers)
+
+ with urllib.request.urlopen(req) as response:
+ total_size = int(response.headers.get("content-length", 0))
+ downloaded = 0
+
+ with open(output_path, "wb") as f:
+ while True:
+ chunk = response.read(chunk_size)
+ if not chunk:
+ break
+ f.write(chunk)
+ downloaded += len(chunk)
+
+ if total_size > 0:
+ progress = (downloaded / total_size) * 100
+ print(
+ f"\r Progress: {progress:.1f}% ({downloaded:,}/{total_size:,} bytes)",
+ end="",
+ )
+
+ print() # New line after progress
+ size_mb = output_path.stat().st_size / (1024 * 1024)
+ print(f" Downloaded: {size_mb:.1f} MB")
+
+ return output_path
From 0b2c2dcb013f03300c1cf98b580ae40ebd5713a8 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:12:03 +0000
Subject: [PATCH 04/15] =?UTF-8?q?feat:=20add=20GML=20converter=20with=20mu?=
=?UTF-8?q?ltiple=20output=20formats=20including=20CSV=20and=20Parquet=20R?=
=?UTF-8?q?apid=20local=20performance=20test=20environment=20supporting=20?=
=?UTF-8?q?the=20Polars=E2=80=91based=20transformation=20rewrite=20in=20di?=
=?UTF-8?q?gital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/gml_converter.py | 458 +++++++++++++++++++++++++++++++++
1 file changed, 458 insertions(+)
create mode 100644 local_testing/gml_converter.py
diff --git a/local_testing/gml_converter.py b/local_testing/gml_converter.py
new file mode 100644
index 00000000..4a3b4036
--- /dev/null
+++ b/local_testing/gml_converter.py
@@ -0,0 +1,458 @@
+"""
+GML converter for title-boundary datasets.
+
+Provides multiple conversion strategies:
+- Regex-based CSV conversion
+- Polars-based Parquet conversion
+- DuckDB-based conversion (fastest, with spatial transforms)
+"""
+
+import csv
+import re
+from pathlib import Path
+from typing import Optional
+
+
+class GMLConverter:
+ """Converts GML files to CSV/Parquet with multiple strategies."""
+
+ @staticmethod
+ def extract_polygon_wkt(geometry_text: str) -> str:
+ """
+ Extract polygon coordinates and convert to WKT format.
+
+ Handles both exterior rings and interior rings (holes).
+
+ Args:
+ geometry_text: GML geometry element text
+
+ Returns:
+ WKT polygon string, or empty string if no valid geometry
+ """
+ exterior_match = re.search(
+            r'<gml:exterior>.*?<gml:posList>([^<]+)</gml:posList>.*?</gml:exterior>',
+ geometry_text, re.DOTALL
+ )
+
+ if not exterior_match:
+ return ""
+
+ exterior_coords_raw = exterior_match.group(1).strip().split()
+ exterior_coords = []
+ for i in range(0, len(exterior_coords_raw), 2):
+ if i + 1 < len(exterior_coords_raw):
+ exterior_coords.append(f"{exterior_coords_raw[i]} {exterior_coords_raw[i+1]}")
+
+ if not exterior_coords:
+ return ""
+
+ # Extract interior rings (holes)
+ interior_rings = []
+ interior_matches = re.findall(
+            r'<gml:interior>.*?<gml:posList>([^<]+)</gml:posList>.*?</gml:interior>',
+ geometry_text, re.DOTALL
+ )
+
+ for interior_coords_raw in interior_matches:
+ coords = interior_coords_raw.strip().split()
+ ring_coords = []
+ for i in range(0, len(coords), 2):
+ if i + 1 < len(coords):
+ ring_coords.append(f"{coords[i]} {coords[i+1]}")
+ if ring_coords:
+ interior_rings.append(ring_coords)
+
+ exterior_wkt = f"({', '.join(exterior_coords)})"
+ if interior_rings:
+ interior_wkts = [f"({', '.join(ring)})" for ring in interior_rings]
+ return f"POLYGON({exterior_wkt}, {', '.join(interior_wkts)})"
+ return f"POLYGON({exterior_wkt})"
+
+ @staticmethod
+ def extract_field(text: str, field_name: str) -> str:
+ """
+ Extract a field value from GML text.
+
+ Args:
+ text: GML text to search
+ field_name: Field name to extract
+
+ Returns:
+ Field value, or empty string if not found
+ """
+        pattern = f'<{field_name}>([^<]+)</{field_name}>'
+ match = re.search(pattern, text)
+ return match.group(1) if match else ""
+
+ def convert_to_csv(self, gml_path: Path, csv_path: Path, limit: Optional[int] = None) -> int:
+ """
+ Convert GML file to CSV format using regex parsing.
+
+ This is the baseline method - slower but doesn't require external dependencies.
+
+ Args:
+ gml_path: Path to input GML file
+ csv_path: Path to output CSV file
+ limit: Optional limit on number of records to convert
+
+ Returns:
+ Number of records converted
+ """
+ print(f" Converting GML to CSV...")
+ print(f" Input: {gml_path}")
+ print(f" Output: {csv_path}")
+
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" GML size: {size_mb:.1f} MB")
+
+ with open(gml_path, 'r', encoding='utf-8') as f:
+ content = f.read()
+
+ # Find all cadastral parcel elements
+        pattern = r'<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>'
+ matches = re.findall(pattern, content, re.DOTALL)
+ total_features = len(matches)
+ print(f" Found {total_features} cadastral parcels")
+
+ if limit:
+ print(f" Limiting to {limit} records")
+
+ fieldnames = [
+ 'reference', 'name', 'national-cadastral-reference', 'geometry',
+ 'start-date', 'entry-date', 'end-date', 'prefix', 'organisation', 'notes'
+ ]
+
+ csv_path.parent.mkdir(parents=True, exist_ok=True)
+ count = 0
+
+ with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
+ writer.writeheader()
+
+ for match in matches:
+ feature = {}
+
+ inspire_id = self.extract_field(match, 'INSPIREID')
+ if inspire_id:
+ feature['reference'] = inspire_id
+ feature['name'] = inspire_id
+
+ ncr = self.extract_field(match, 'NATIONALCADASTRALREFERENCE')
+ if ncr:
+ feature['national-cadastral-reference'] = ncr
+
+ valid_from = self.extract_field(match, 'VALIDFROM')
+ if valid_from:
+ feature['start-date'] = valid_from.split('T')[0] if 'T' in valid_from else valid_from
+
+ begin_lifespan = self.extract_field(match, 'BEGINLIFESPANVERSION')
+ if begin_lifespan:
+ feature['entry-date'] = begin_lifespan.split('T')[0] if 'T' in begin_lifespan else begin_lifespan
+
+                geometry_match = re.search(r'<cp:geometry>(.*?)</cp:geometry>', match, re.DOTALL)
+ if geometry_match:
+ wkt = self.extract_polygon_wkt(geometry_match.group(1))
+ if wkt:
+ feature['geometry'] = wkt
+
+ if 'reference' in feature:
+ feature['prefix'] = 'title-boundary'
+ feature['organisation'] = 'government-organisation:D2'
+ writer.writerow(feature)
+ count += 1
+
+ if count % 5000 == 0:
+ print(f" Converted {count}/{total_features} features...")
+
+ if limit and count >= limit:
+ break
+
+ print(f" Converted {count} records to CSV")
+ return count
+
+ def convert_to_parquet(self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None) -> int:
+ """
+ Convert GML file to Parquet format using regex parsing + Polars.
+
+ Parquet is faster to read than CSV and preserves data types.
+ Falls back to CSV if Polars is not installed.
+
+ Args:
+ gml_path: Path to input GML file
+ parquet_path: Path to output Parquet file
+ limit: Optional limit on number of records to convert
+
+ Returns:
+ Number of records converted
+ """
+ try:
+ import polars as pl
+ except ImportError:
+ print(" Polars not installed. Install with: pip install polars")
+ print(" Falling back to CSV...")
+ csv_path = parquet_path.with_suffix('.csv')
+ return self.convert_to_csv(gml_path, csv_path, limit)
+
+ print(f" Converting GML to Parquet...")
+ print(f" Input: {gml_path}")
+ print(f" Output: {parquet_path}")
+
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" GML size: {size_mb:.1f} MB")
+
+ with open(gml_path, 'r', encoding='utf-8') as f:
+ content = f.read()
+
+ # Find all cadastral parcel elements
+        pattern = r'<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>'
+ matches = re.findall(pattern, content, re.DOTALL)
+ total_features = len(matches)
+ print(f" Found {total_features} cadastral parcels")
+
+ if limit:
+ print(f" Limiting to {limit} records")
+ matches = matches[:limit]
+
+ # Build list of records
+ records = []
+ for match in matches:
+ feature = {}
+
+ inspire_id = self.extract_field(match, 'INSPIREID')
+ if inspire_id:
+ feature['reference'] = inspire_id
+ feature['name'] = inspire_id
+
+ ncr = self.extract_field(match, 'NATIONALCADASTRALREFERENCE')
+ if ncr:
+ feature['national-cadastral-reference'] = ncr
+
+ valid_from = self.extract_field(match, 'VALIDFROM')
+ if valid_from:
+ feature['start-date'] = valid_from.split('T')[0] if 'T' in valid_from else valid_from
+
+ begin_lifespan = self.extract_field(match, 'BEGINLIFESPANVERSION')
+ if begin_lifespan:
+ feature['entry-date'] = begin_lifespan.split('T')[0] if 'T' in begin_lifespan else begin_lifespan
+
+            geometry_match = re.search(r'<cp:geometry>(.*?)</cp:geometry>', match, re.DOTALL)
+ if geometry_match:
+ wkt = self.extract_polygon_wkt(geometry_match.group(1))
+ if wkt:
+ feature['geometry'] = wkt
+
+ if 'reference' in feature:
+ feature['prefix'] = 'title-boundary'
+ feature['organisation'] = 'government-organisation:D2'
+ feature['end-date'] = None
+ feature['notes'] = None
+ records.append(feature)
+
+ # Create DataFrame and write to Parquet
+ parquet_path.parent.mkdir(parents=True, exist_ok=True)
+
+ df = pl.DataFrame(records)
+ df.write_parquet(parquet_path, compression='snappy')
+
+ count = len(records)
+ print(f" Converted {count} records to Parquet")
+ return count
+
+ def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None) -> int:
+ """
+ Convert GML file to Parquet format using DuckDB with spatial extension.
+
+ This is the fastest method - DuckDB reads GML directly and writes Parquet.
+ Falls back to Polars-based converter if DuckDB is not available.
+
+ Args:
+ gml_path: Path to input GML file
+ parquet_path: Path to output Parquet file
+ limit: Optional limit on number of records to convert
+
+ Returns:
+ Number of records converted
+ """
+ try:
+ import duckdb
+ except ImportError:
+ print(" DuckDB not installed. Install with: pip install duckdb")
+ print(" Falling back to Polars-based converter...")
+ return self.convert_to_parquet(gml_path, parquet_path, limit)
+
+ print(f" Converting GML to Parquet using DuckDB...")
+ print(f" Input: {gml_path}")
+ print(f" Output: {parquet_path}")
+
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" GML size: {size_mb:.1f} MB")
+
+ parquet_path.parent.mkdir(parents=True, exist_ok=True)
+
+ try:
+ con = duckdb.connect()
+ try:
+ con.execute("INSTALL spatial; LOAD spatial;")
+ print(" Loaded DuckDB spatial extension")
+ except Exception as ext_err:
+ print(f" Failed to load spatial extension: {ext_err}")
+ print(" Falling back to Polars-based converter...")
+ con.close()
+ return self.convert_to_parquet(gml_path, parquet_path, limit)
+
+ print(" Reading GML file...")
+ limit_clause = f"LIMIT {limit}" if limit else ""
+
+ query = f"""
+ SELECT
+ INSPIREID as reference,
+ INSPIREID as name,
+ NATIONALCADASTRALREFERENCE as "national-cadastral-reference",
+ ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry,
+ CASE
+ WHEN VALIDFROM IS NOT NULL
+ THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d')
+ ELSE NULL
+ END as "start-date",
+ CASE
+ WHEN BEGINLIFESPANVERSION IS NOT NULL
+ THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d')
+ ELSE NULL
+ END as "entry-date",
+ NULL as "end-date",
+ 'title-boundary' as prefix,
+ 'government-organisation:D2' as organisation,
+ NULL as notes
+ FROM ST_Read('{gml_path}')
+ WHERE INSPIREID IS NOT NULL
+ {limit_clause}
+ """
+
+ count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')"
+ total_count = con.execute(count_query).fetchone()[0]
+ print(f" Found {total_count:,} cadastral parcels")
+
+ if limit:
+ print(f" Limiting to {limit} records")
+
+ # Export directly to Parquet (much faster than CSV)
+ print(" Transforming and writing to Parquet...")
+ con.execute(f"COPY ({query}) TO '{parquet_path}' (FORMAT PARQUET, COMPRESSION 'snappy')")
+
+ # Count output rows
+ result_count = con.execute(f"SELECT COUNT(*) FROM read_parquet('{parquet_path}')").fetchone()[0]
+
+ con.close()
+
+ print(f" Converted {result_count:,} records to Parquet")
+ return result_count
+
+ except Exception as e:
+ print(f" DuckDB conversion failed: {e}")
+ print(" Falling back to Polars-based converter...")
+ return self.convert_to_parquet(gml_path, parquet_path, limit)
+
+ def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[int] = None) -> int:
+ """
+ Convert GML file to CSV format using DuckDB with spatial extension.
+
+ This is significantly faster than regex parsing and properly handles:
+ - Coordinate transformations (OSGB EPSG:27700 to WGS84 EPSG:4326)
+ - Complex geometries (multi-polygons, holes)
+ - Large files with streaming
+
+ Note: For even better performance, use convert_to_parquet_duckdb() instead.
+ Falls back to regex-based converter if DuckDB is not available.
+
+ Args:
+ gml_path: Path to input GML file
+ csv_path: Path to output CSV file
+ limit: Optional limit on number of records to convert
+
+ Returns:
+ Number of records converted
+ """
+ try:
+ import duckdb
+ except ImportError:
+ print(" DuckDB not installed. Install with: pip install duckdb")
+ print(" Falling back to regex-based converter...")
+ return self.convert_to_csv(gml_path, csv_path, limit)
+
+ print(f" Converting GML to CSV using DuckDB...")
+ print(f" Input: {gml_path}")
+ print(f" Output: {csv_path}")
+
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" GML size: {size_mb:.1f} MB")
+
+ csv_path.parent.mkdir(parents=True, exist_ok=True)
+
+ try:
+ # Create DuckDB connection and load spatial extension
+ con = duckdb.connect()
+ try:
+ con.execute("INSTALL spatial; LOAD spatial;")
+ print(" Loaded DuckDB spatial extension")
+ except Exception as ext_err:
+ print(f" Failed to load spatial extension: {ext_err}")
+ print(" This may be a network issue. Try running:")
+ print(" python -c \"import duckdb; duckdb.connect().execute('INSTALL spatial')\"")
+ print(" Falling back to regex-based converter...")
+ con.close()
+ return self.convert_to_csv(gml_path, csv_path, limit)
+
+ # Read GML file using ST_Read (GDAL-based)
+ print(" Reading GML file...")
+
+ limit_clause = f"LIMIT {limit}" if limit else ""
+
+ query = f"""
+ SELECT
+ INSPIREID as reference,
+ INSPIREID as name,
+ NATIONALCADASTRALREFERENCE as "national-cadastral-reference",
+ ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry,
+ CASE
+ WHEN VALIDFROM IS NOT NULL
+ THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d')
+ ELSE NULL
+ END as "start-date",
+ CASE
+ WHEN BEGINLIFESPANVERSION IS NOT NULL
+ THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d')
+ ELSE NULL
+ END as "entry-date",
+ NULL as "end-date",
+ 'title-boundary' as prefix,
+ 'government-organisation:D2' as organisation,
+ NULL as notes
+ FROM ST_Read('{gml_path}')
+ WHERE INSPIREID IS NOT NULL
+ {limit_clause}
+ """
+
+ # Execute and get count first
+ count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')"
+ total_count = con.execute(count_query).fetchone()[0]
+ print(f" Found {total_count:,} cadastral parcels")
+
+ if limit:
+ print(f" Limiting to {limit} records")
+
+ # Export directly to CSV
+ print(" Transforming and writing to CSV...")
+ con.execute(f"COPY ({query}) TO '{csv_path}' (HEADER, DELIMITER ',')")
+
+ # Count output rows
+ result_count = con.execute(f"SELECT COUNT(*) FROM read_csv('{csv_path}')").fetchone()[0]
+
+ con.close()
+
+ print(f" Converted {result_count:,} records to CSV")
+ return result_count
+
+ except Exception as e:
+ print(f" DuckDB conversion failed: {e}")
+ print(" Falling back to regex-based converter...")
+ return self.convert_to_csv(gml_path, csv_path, limit)
From ec0a0de79dc2813229e7f44fb055de5becef9ba0 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:12:18 +0000
Subject: [PATCH 05/15] =?UTF-8?q?feat:=20add=20GML=20extractor=20for=20tit?=
=?UTF-8?q?le-boundary=20datasets=20with=20ZIP=20archive=20support=20Rapid?=
=?UTF-8?q?=20local=20performance=20test=20environment=20supporting=20the?=
=?UTF-8?q?=20Polars=E2=80=91based=20transformation=20rewrite=20in=20digit?=
=?UTF-8?q?al-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/gml_extractor.py | 50 ++++++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)
create mode 100644 local_testing/gml_extractor.py
diff --git a/local_testing/gml_extractor.py b/local_testing/gml_extractor.py
new file mode 100644
index 00000000..fb004301
--- /dev/null
+++ b/local_testing/gml_extractor.py
@@ -0,0 +1,50 @@
+"""
+GML extractor for title-boundary datasets.
+
+Handles extraction of GML files from ZIP archives.
+"""
+
+import zipfile
+from pathlib import Path
+
+
+class GMLExtractor:
+ """Extracts GML files from ZIP archives."""
+
+ @staticmethod
+ def extract_gml_from_zip(zip_path: Path, output_dir: Path) -> Path:
+ """
+ Extract GML file from ZIP archive.
+
+ Args:
+ zip_path: Path to ZIP file
+ output_dir: Directory to extract GML file to
+
+ Returns:
+ Path to extracted GML file
+
+ Raises:
+ ValueError: If no GML file found in archive
+ """
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ print(f" Extracting GML from {zip_path}")
+
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+ # Find GML file in archive
+ gml_files = [f for f in zip_ref.namelist() if f.lower().endswith('.gml')]
+
+ if not gml_files:
+ raise ValueError(f"No GML file found in {zip_path}")
+
+ gml_filename = gml_files[0]
+ print(f" Found: {gml_filename}")
+
+ # Extract to output directory
+ zip_ref.extract(gml_filename, output_dir)
+
+ gml_path = output_dir / gml_filename
+ size_mb = gml_path.stat().st_size / (1024 * 1024)
+ print(f" Extracted: {gml_path} ({size_mb:.1f} MB)")
+
+ return gml_path
From 92558deba0115c81501883aa9df8a439c45f85ba Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:12:32 +0000
Subject: [PATCH 06/15] =?UTF-8?q?feat:=20add=20Makefile=20for=20title=20bo?=
=?UTF-8?q?undary=20pipeline=20setup=20and=20management=20Rapid=20local=20?=
=?UTF-8?q?performance=20test=20environment=20supporting=20the=20Polars?=
=?UTF-8?q?=E2=80=91based=20transformation=20rewrite=20in=20digital-land-p?=
=?UTF-8?q?ython=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/Makefile | 96 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 96 insertions(+)
create mode 100644 local_testing/Makefile
diff --git a/local_testing/Makefile b/local_testing/Makefile
new file mode 100644
index 00000000..7b564da0
--- /dev/null
+++ b/local_testing/Makefile
@@ -0,0 +1,96 @@
+.PHONY: help init setup-dirs setup-spec check-spec list run fast compare test clean clean-all
+
+PYTHON := venv/bin/python3
+PIP := venv/bin/pip
+SPEC_DIR := ../specification
+
+help:
+	@echo "Title Boundary Pipeline"
+	@echo ""
+	@echo "  make init             First time setup (dirs + venv + spec check)"
+	@echo "  make setup-dirs       Create all required directories"
+	@echo "  make setup-spec       Clone specification files from GitHub"
+	@echo "  make check-spec       Check if specification files exist"
+	@echo "  make list             List available Local Authorities"
+	@echo "  make run LA=Name      Process a Local Authority"
+	@echo "  make fast LA=Name     Process with DuckDB + Parquet"
+	@echo "  make compare LA=Name  Run original + Polars for comparison"
+	@echo "  make test             Test all module imports"
+	@echo "  make clean            Remove generated data"
+	@echo "  make clean-all        Remove data + venv"
+	@echo ""
+	@echo "Examples:"
+	@echo "  make init             # Complete setup"
+	@echo "  make setup-spec       # Clone specification if missing"
+	@echo "  make run LA=Buckinghamshire LIMIT=100"
+	@echo "  make fast LA=\"East Sussex\""
+	@echo "  make compare LA=Buckinghamshire LIMIT=1000"
+
+setup-dirs:
+	@mkdir -p raw extracted converted output reports cache pipeline
+	@echo "✅ Created directories: raw/ extracted/ converted/ output/ reports/ cache/ pipeline/"
+
+setup-spec:
+	@if [ -d "$(SPEC_DIR)" ]; then \
+		echo "✅ Specification already exists at $(SPEC_DIR)"; \
+	else \
+		echo "📥 Cloning specification from GitHub..."; \
+		cd .. && git clone https://github.com/digital-land/specification.git; \
+		if [ -d "$(SPEC_DIR)" ]; then \
+			echo "✅ Specification cloned successfully"; \
+			echo "   Files: $$(ls -1 $(SPEC_DIR)/*.csv 2>/dev/null | wc -l | tr -d ' ') CSV files"; \
+		else \
+			echo "❌ Failed to clone specification"; \
+			exit 1; \
+		fi \
+	fi
+
+check-spec:
+	@if [ -d "$(SPEC_DIR)" ]; then \
+		echo "✅ Specification found at $(SPEC_DIR)"; \
+		echo "   Files: $$(ls -1 $(SPEC_DIR)/*.csv 2>/dev/null | wc -l | tr -d ' ') CSV files"; \
+	else \
+		echo "❌ Specification not found at $(SPEC_DIR)"; \
+		echo ""; \
+		echo "Run 'make setup-spec' to clone automatically, or:"; \
+		echo "  cd ../"; \
+		echo "  git clone https://github.com/digital-land/specification.git"; \
+		echo ""; \
+		exit 1; \
+	fi
+
+init: setup-dirs venv setup-spec
+	@echo "✅ Setup complete - ready to run pipeline"
+
+venv:
+	@python3 -m venv venv
+	@$(PIP) install -q --upgrade pip
+	@$(PIP) install -q polars duckdb requests
+	@$(PIP) install -q -e ..
+	@echo "  ✓ Installed digital-land-python in editable mode"
+
+list: venv
+	@$(PYTHON) main.py --list
+
+run: venv
+	@test -n "$(LA)" || (echo "Error: make run LA=Name"; exit 1)
+	@$(PYTHON) main.py --la "$(LA)" $(if $(LIMIT),--limit $(LIMIT))
+
+fast: venv
+	@test -n "$(LA)" || (echo "Error: make fast LA=Name"; exit 1)
+	@$(PYTHON) main.py --la "$(LA)" --use-duckdb --use-parquet $(if $(LIMIT),--limit $(LIMIT))
+
+compare: venv
+	@test -n "$(LA)" || (echo "Error: make compare LA=Name"; exit 1)
+	@$(PYTHON) main.py --la "$(LA)" --compare $(if $(LIMIT),--limit $(LIMIT))
+
+test: venv
+	@$(PYTHON) -c "from cli import CLI; from file_downloader import FileDownloader; from gml_extractor import GMLExtractor; from gml_converter import GMLConverter; from pipeline_config import PipelineConfig; from pipeline_runner import PipelineRunner; from pipeline_report import PipelineReport; print('✅ All modules OK')"
+
+clean:
+	@rm -rf raw/* extracted/* converted/* output/* reports/*
+	@echo "✅ Data cleaned"
+
+clean-all: clean
+	@rm -rf venv/ cache/*
+	@echo "✅ All cleaned"
From 52be8abd7f8e838d7ab44b8f2634091fa7d612a6 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:13:12 +0000
Subject: [PATCH 07/15] =?UTF-8?q?feat:=20add=20pipeline=20configuration=20?=
=?UTF-8?q?management=20for=20title-boundary=20dataset=20Rapid=20local=20p?=
=?UTF-8?q?erformance=20test=20environment=20supporting=20the=20Polars?=
=?UTF-8?q?=E2=80=91based=20transformation=20rewrite=20in=20digital-land-p?=
=?UTF-8?q?ython=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/pipeline_config.py | 93 ++++++++++++++++++++++++++++++++
1 file changed, 93 insertions(+)
create mode 100644 local_testing/pipeline_config.py
diff --git a/local_testing/pipeline_config.py b/local_testing/pipeline_config.py
new file mode 100644
index 00000000..35aaf338
--- /dev/null
+++ b/local_testing/pipeline_config.py
@@ -0,0 +1,93 @@
+"""
+Pipeline configuration management for title-boundary dataset.
+
+Handles creation and management of pipeline configuration CSV files
+and downloading of required resources like organisation.csv.
+"""
+
+import urllib.request
+from pathlib import Path
+
+
+class PipelineConfig:
+    """Manages pipeline configuration files and resources."""
+
+    @staticmethod
+    def ensure_pipeline_config(pipeline_dir: Path):
+        """
+        Ensure all required pipeline configuration CSV files exist.
+
+        Creates default configuration files for:
+        - column mapping
+        - default values
+        - patches, concatenations, combinations
+        - filters, lookups, migrations, redirects
+
+        Args:
+            pipeline_dir: Directory where pipeline config files should be created
+        """
+        pipeline_dir.mkdir(parents=True, exist_ok=True)  # idempotent: safe to call repeatedly
+
+        configs = {  # filename -> default content; most files are a bare header row
+            "column.csv": """dataset,resource,column,field
+title-boundary,,reference,reference
+title-boundary,,name,name
+title-boundary,,geometry,geometry
+title-boundary,,start-date,start-date
+title-boundary,,entry-date,entry-date
+title-boundary,,end-date,end-date
+title-boundary,,prefix,prefix
+title-boundary,,organisation,organisation
+title-boundary,,notes,notes
+title-boundary,,national-cadastral-reference,notes
+""",
+            "default.csv": "dataset,resource,field,default-field,entry-date\n",
+            "patch.csv": "dataset,resource,field,pattern,value\n",
+            "concat.csv": "dataset,resource,field,fields,separator\n",
+            "combine.csv": "dataset,resource,field,fields,separator\n",
+            "convert.csv": "dataset,resource,field,value,replacement\n",
+            "filter.csv": "dataset,resource,field,pattern\n",
+            "skip.csv": "dataset,resource,pattern\n",
+            "lookup.csv": "prefix,resource,organisation,reference,entity\n",
+            "migrate.csv": "dataset,old-field,new-field\n",
+            "redirect.csv": "entity,status,redirect-entity\n",
+        }
+
+        for filename, content in configs.items():
+            filepath = pipeline_dir / filename
+            if not filepath.exists():  # never overwrite a user-edited config file
+                filepath.write_text(content)
+
+    @staticmethod
+    def download_organisation_csv(cache_dir: Path) -> Path:
+        """
+        Download organisation.csv from digital-land repository if not present.
+
+        Falls back to creating a minimal organisation.csv with Land Registry
+        data if download fails.
+
+        Args:
+            cache_dir: Directory where organisation.csv should be cached
+
+        Returns:
+            Path to organisation.csv file
+        """
+        org_csv = cache_dir / "organisation.csv"
+
+        if not org_csv.exists():  # cached copy wins; delete the file to force a re-download
+            print("  Downloading organisation.csv...")
+            url = "https://raw.githubusercontent.com/digital-land/organisation-dataset/main/collection/organisation.csv"
+
+            try:
+                cache_dir.mkdir(parents=True, exist_ok=True)
+                urllib.request.urlretrieve(url, org_csv)
+                print(f"  Downloaded organisation.csv ({org_csv.stat().st_size} bytes)")
+            except Exception as e:  # best-effort: any failure (network, DNS, 404) falls back to the stub below
+                print(f"  Warning: Could not download ({e}), creating minimal file")
+                org_csv.write_text(
+                    "organisation,name,statistical-geography,opendatacommunities-uri\n"
+                    "government-organisation:D2,Land Registry,E92000001,"
+                    "http://opendatacommunities.org/id/government-organisation/land-registry\n"
+                )
+
+        return org_csv
From f391856ca54ff44dd33c66dbbee67a29de81deaa Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 2 Feb 2026 22:19:56 +0000
Subject: [PATCH 08/15] =?UTF-8?q?feat:=20update=20.gitignore=20to=20includ?=
=?UTF-8?q?e=20local=20testing=20scripts=20and=20README=20Rapid=20local=20?=
=?UTF-8?q?performance=20test=20environment=20supporting=20the=20Polars?=
=?UTF-8?q?=E2=80=91based=20transformation=20rewrite=20in=20digital-land-p?=
=?UTF-8?q?ython=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/.gitignore b/.gitignore
index 0d82846f..ea7b0f32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,3 +49,8 @@ notebooks/.ipynb_checkpoints
/local_testing/reports/
/local_testing/specification/
/local_testing/venv/
+
+/local_testing/main.py
+/local_testing/pipeline_report.py
+/local_testing/pipeline_runner.py
+/local_testing/README.md
From 82857ad5f44e2c80915a3f1ef72bc738dbf6a7fc Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Fri, 6 Feb 2026 11:38:03 +0000
Subject: [PATCH 09/15] =?UTF-8?q?feat:=20enhance=20Makefile=20and=20CLI=20?=
=?UTF-8?q?for=20improved=20pipeline=20comparison=20and=20add=20run=5Fall?=
=?UTF-8?q?=20script=20for=20batch=20processing=20Rapid=20local=20performa?=
=?UTF-8?q?nce=20test=20environment=20supporting=20the=20Polars=E2=80=91ba?=
=?UTF-8?q?sed=20transformation=20rewrite=20in=20digital-land-python=20Fix?=
=?UTF-8?q?es=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/Makefile | 24 +++++----
local_testing/cli.py | 5 ++
local_testing/run_all.py | 113 +++++++++++++++++++++++++++++++++++++++
3 files changed, 133 insertions(+), 9 deletions(-)
create mode 100755 local_testing/run_all.py
diff --git a/local_testing/Makefile b/local_testing/Makefile
index 7b564da0..53835a7a 100644
--- a/local_testing/Makefile
+++ b/local_testing/Makefile
@@ -1,4 +1,4 @@
-.PHONY: help init setup-dirs setup-spec check-spec list run fast compare test clean clean-all
+.PHONY: help init setup-dirs setup-spec check-spec list run run-all fast compare test clean clean-all
PYTHON := venv/bin/python3
PIP := venv/bin/pip
@@ -12,19 +12,22 @@ help:
@echo " make setup-spec Clone specification files from GitHub"
@echo " make check-spec Check if specification files exist"
@echo " make list List available Local Authorities"
- @echo " make run LA=Name Process a Local Authority"
- @echo " make fast LA=Name Process with DuckDB + Parquet"
- @echo " make compare LA=Name Run original + Polars for comparison"
+ @echo " make run LA=Name Process with comparison (Original + Polars)"
+ @echo " make run-all Process ALL LAs with comparison"
+ @echo " make fast LA=Name DuckDB+Parquet with comparison"
@echo " make test Test all module imports"
@echo " make clean Remove generated data"
@echo " make clean-all Remove data + venv"
@echo ""
+ @echo "Note: All run commands automatically include Polars comparison"
+ @echo ""
@echo "Examples:"
@echo " make init # Complete setup"
@echo " make setup-spec # Clone specification if missing"
- @echo " make run LA=Buckinghamshire LIMIT=100"
- @echo " make fast LA=\"East Sussex\""
- @echo " make compare LA=Buckinghamshire LIMIT=1000"
+ @echo " make run LA=Buckinghamshire LIMIT=100 # Compare both pipelines"
+ @echo " make run LA=Buckinghamshire PHASES=1,2,9 # Run specific phases"
+ @echo " make run-all LIMIT=100 # Process all LAs with comparison"
+ @echo " make fast LA=\"East Sussex\" # Fast mode with comparison"
setup-dirs:
@mkdir -p raw extracted converted output reports cache pipeline
@@ -74,11 +77,14 @@ list: venv
run: venv
@test -n "$(LA)" || (echo "Error: make run LA=Name"; exit 1)
- @$(PYTHON) main.py --la "$(LA)" $(if $(LIMIT),--limit $(LIMIT))
+ @$(PYTHON) main.py --la "$(LA)" --compare $(if $(LIMIT),--limit $(LIMIT)) $(if $(PHASES),--phases $(PHASES))
+
+run-all: venv
+ @$(PYTHON) run_all.py $(LIMIT)
fast: venv
@test -n "$(LA)" || (echo "Error: make fast LA=Name"; exit 1)
- @$(PYTHON) main.py --la "$(LA)" --use-duckdb --use-parquet $(if $(LIMIT),--limit $(LIMIT))
+ @$(PYTHON) main.py --la "$(LA)" --use-duckdb --use-parquet --compare $(if $(LIMIT),--limit $(LIMIT))
compare: venv
@test -n "$(LA)" || (echo "Error: make compare LA=Name"; exit 1)
diff --git a/local_testing/cli.py b/local_testing/cli.py
index 4ab74687..2eb0f774 100644
--- a/local_testing/cli.py
+++ b/local_testing/cli.py
@@ -62,6 +62,11 @@ def create_parser() -> argparse.ArgumentParser:
action="store_true",
help="Run both original and Polars pipelines for performance comparison",
)
+ parser.add_argument(
+ "--phases",
+ type=str,
+ help="Comma-separated phase numbers to run (e.g. '1,2,9' or '1-5,9')",
+ )
return parser
diff --git a/local_testing/run_all.py b/local_testing/run_all.py
new file mode 100755
index 00000000..19856b00
--- /dev/null
+++ b/local_testing/run_all.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""
+Script to run the pipeline for all Local Authorities.
+"""
+
+import sys
+import subprocess
+import time
+import json
+from pathlib import Path
+from datetime import datetime
+from cli import CLI
+
+
+def main():
+ """Run pipeline for all Local Authorities."""
+ # Get limit from command line if provided
+ limit = None
+ if len(sys.argv) > 1:
+ limit = sys.argv[1]
+
+ # Fetch all endpoints
+ print("Fetching endpoint list...")
+ endpoints = CLI.fetch_endpoint_list()
+ print(f"Found {len(endpoints)} Local Authorities")
+ print(f"Running with Polars comparison enabled\n")
+
+ success_count = 0
+ error_count = 0
+ errors = []
+ la_times = []
+ batch_start = time.time()
+
+ for i, ep in enumerate(endpoints, 1):
+ la = ep.get("local_authority", "Unknown")
+ print(f"\n{'='*60}")
+ print(f"[{i}/{len(endpoints)}] Processing: {la}")
+ print(f"{'='*60}")
+
+ # Build command with --compare flag for Polars
+ cmd = [sys.executable, "main.py", "--la", la, "--compare"]
+ if limit:
+ cmd.extend(["--limit", limit])
+
+ # Time this LA
+ la_start = time.time()
+ result = subprocess.run(cmd)
+ la_duration = time.time() - la_start
+
+ if result.returncode != 0:
+ print(f" ā ļø Error processing {la}")
+ error_count += 1
+ errors.append(la)
+ la_times.append({"la": la, "duration": la_duration, "status": "error"})
+ else:
+ print(f" ā
Completed {la} ({la_duration:.1f}s)")
+ success_count += 1
+ la_times.append({"la": la, "duration": la_duration, "status": "success"})
+
+ # Calculate batch metrics
+ batch_duration = time.time() - batch_start
+ avg_duration = sum(t["duration"] for t in la_times) / len(la_times) if la_times else 0
+ successful_times = [t["duration"] for t in la_times if t["status"] == "success"]
+
+ # Summary
+ print(f"\n{'='*60}")
+ print("BATCH PROCESSING COMPLETE (with Polars Comparison)")
+ print(f"{'='*60}")
+ print(f" Total LAs: {len(endpoints)}")
+ print(f" Success: {success_count}")
+ print(f" Errors: {error_count}")
+ print(f" Total Time: {batch_duration:.1f}s ({batch_duration/60:.1f}m)")
+ print(f" Avg Time/LA: {avg_duration:.1f}s")
+ if successful_times:
+ print(f" Min Time: {min(successful_times):.1f}s")
+ print(f" Max Time: {max(successful_times):.1f}s")
+ print(f"\n Note: All LAs processed with both Original + Polars pipelines")
+
+ if errors:
+ print(f"\nFailed Local Authorities:")
+ for la in errors:
+ print(f" - {la}")
+
+ # Save batch report
+ reports_dir = Path(__file__).parent / "reports"
+ reports_dir.mkdir(parents=True, exist_ok=True)
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+ batch_report = {
+ "batch_timestamp": timestamp,
+ "total_las": len(endpoints),
+ "success_count": success_count,
+ "error_count": error_count,
+ "batch_duration_seconds": batch_duration,
+ "average_duration_seconds": avg_duration,
+ "polars_comparison_enabled": True,
+ "limit": limit,
+ "la_results": la_times,
+ "errors": errors
+ }
+
+ batch_json = reports_dir / f"batch_{timestamp}_summary.json"
+ with open(batch_json, "w") as f:
+ json.dump(batch_report, f, indent=2)
+
+ print(f"\nBatch report saved: {batch_json}")
+ print(f"{'='*60}\n")
+
+ return 1 if error_count > 0 else 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
From bfeb931e2ba2dc34cfcbe20b0797cfdfa49e4974 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Fri, 6 Feb 2026 11:42:23 +0000
Subject: [PATCH 10/15] =?UTF-8?q?feat:=20update=20pipeline=20configuration?=
=?UTF-8?q?=20files=20and=20add=20README=20for=20local=20testing=20environ?=
=?UTF-8?q?ment=20Rapid=20local=20performance=20test=20environment=20suppo?=
=?UTF-8?q?rting=20the=20Polars=E2=80=91based=20transformation=20rewrite?=
=?UTF-8?q?=20in=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 3 +-
local_testing/README.md | 311 ++++++++++++++++++++++++++++
local_testing/pipeline/column.csv | 11 +
local_testing/pipeline/combine.csv | 1 +
local_testing/pipeline/concat.csv | 1 +
local_testing/pipeline/convert.csv | 1 +
local_testing/pipeline/default.csv | 2 +
local_testing/pipeline/filter.csv | 1 +
local_testing/pipeline/lookup.csv | 11 +
local_testing/pipeline/migrate.csv | 1 +
local_testing/pipeline/patch.csv | 1 +
local_testing/pipeline/redirect.csv | 1 +
local_testing/pipeline/skip.csv | 1 +
13 files changed, 344 insertions(+), 2 deletions(-)
create mode 100644 local_testing/README.md
create mode 100644 local_testing/pipeline/column.csv
create mode 100644 local_testing/pipeline/combine.csv
create mode 100644 local_testing/pipeline/concat.csv
create mode 100644 local_testing/pipeline/convert.csv
create mode 100644 local_testing/pipeline/default.csv
create mode 100644 local_testing/pipeline/filter.csv
create mode 100644 local_testing/pipeline/lookup.csv
create mode 100644 local_testing/pipeline/migrate.csv
create mode 100644 local_testing/pipeline/patch.csv
create mode 100644 local_testing/pipeline/redirect.csv
create mode 100644 local_testing/pipeline/skip.csv
diff --git a/.gitignore b/.gitignore
index ea7b0f32..d65ec89a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,7 +43,6 @@ notebooks/.ipynb_checkpoints
/local_testing/converted/
/local_testing/extracted/
/local_testing/output/
-/local_testing/pipeline/
/local_testing/polars_phases/
/local_testing/raw/
/local_testing/reports/
@@ -53,4 +52,4 @@ notebooks/.ipynb_checkpoints
/local_testing/main.py
/local_testing/pipeline_report.py
/local_testing/pipeline_runner.py
-/local_testing/README.md
+
diff --git a/local_testing/README.md b/local_testing/README.md
new file mode 100644
index 00000000..41de9085
--- /dev/null
+++ b/local_testing/README.md
@@ -0,0 +1,311 @@
+# Digital Land Pipeline - Local Testing
+
+A modular, self-contained environment for testing the digital-land transformation pipeline
+on various datasets (e.g., UK Land Registry title-boundary data).
+
+## Architecture
+
+The pipeline uses a **clean modular architecture** with 8 specialized classes:
+
+1. **CLI** (121 lines) - Command-line interface and argument parsing
+2. **FileDownloader** (95 lines) - Downloads GML files from data sources
+3. **GMLExtractor** (50 lines) - Extracts GML from ZIP archives
+4. **GMLConverter** (458 lines) - Converts GML to CSV/Parquet (4 strategies)
+5. **PipelineConfig** (93 lines) - Manages pipeline configuration files
+6. **PipelineRunner** (254 lines) - Executes 26-phase digital-land transformation
+7. **PipelineReport** (346 lines) - Performance tracking and reporting
+8. **main.py** (265 lines) - Orchestrates the pipeline by calling specialized classes
+
+**Total**: 2,449 lines across 9 focused, testable modules (down from 1,688 monolithic lines)
+
+## Prerequisites
+
+### Specification Files
+
+The pipeline requires specification files from the digital-land specification repository. These files define schemas, fields, datatypes, and pipeline configurations.
+
+**Files Used (11 of 25):**
+- `dataset.csv`, `schema.csv`, `dataset-schema.csv`
+- `datatype.csv`, `field.csv`, `dataset-field.csv`, `schema-field.csv`
+- `typology.csv`, `pipeline.csv`, `licence.csv`, `provision-rule.csv`
+
+The remaining 14 files are not loaded by the pipeline but may be used by other digital-land tools.
+
+## Quick Start
+
+**Using Makefile (Recommended):**
+
+```bash
+# Navigate to directory
+cd digital-land-python/local_testing
+
+# First time setup - automatically creates directories, installs dependencies, and clones specification
+make init
+
+# If specification already exists elsewhere, you can symlink it instead:
+# cd digital-land-python
+# ln -s /path/to/your/specification specification
+
+# Verify setup
+make check-spec
+
+# List available Local Authorities (for title-boundary dataset)
+make list
+
+# Process a specific LA (includes Polars comparison automatically)
+make run LA="Buckinghamshire"
+
+# Process with record limit
+make run LA="Buckinghamshire" LIMIT=100
+
+# Process ALL Local Authorities (batch mode with comparison)
+make run-all
+
+# Process all with record limit (for testing)
+make run-all LIMIT=100
+
+# Run only specific phases (e.g., phases 1,2,9)
+make run LA="Buckinghamshire" PHASES="1,2,9"
+
+# Run range of phases (e.g., phases 1-5 and 9)
+make run LA="Buckinghamshire" PHASES="1-5,9"
+
+# Use best performance (DuckDB + Parquet, includes comparison)
+make fast LA="Buckinghamshire"
+
+# See all available commands
+make help
+
+# Note: All run commands automatically include Polars comparison
+```
+
+**Manual Setup (Alternative):**
+
+```bash
+# Navigate to local testing directory
+cd digital-land-python/local_testing
+
+# Create virtual environment (first time only)
+python3 -m venv venv
+
+# Activate virtual environment
+source venv/bin/activate
+
+# Install dependencies (first time only)
+pip install polars duckdb
+
+# List available items (dataset-specific)
+python main.py --list
+
+# Process a specific item
+python main.py --la "Buckinghamshire"
+
+# Process with record limit (for testing)
+python main.py --la "Buckinghamshire" --limit 100
+
+# Skip download if already have the file
+python main.py --la "Buckinghamshire" --skip-download
+
+# Use DuckDB with Parquet for best performance
+python main.py --la "Buckinghamshire" --use-duckdb --use-parquet
+```
+
+## What It Does
+
+The pipeline performs 5 steps:
+
+1. **Download** (FileDownloader) - Fetches data files from source API
+2. **Extract** (GMLExtractor) - Unzips and locates GML files
+3. **Convert** (GMLConverter) - Parses GML and converts to CSV/Parquet (4 methods available)
+4. **Transform** (PipelineRunner) - Runs full 26-phase digital-land pipeline
+5. **Report** (PipelineReport) - Generates performance report (JSON + text)
+
+Each step delegates to a specialized class for clean separation of concerns.
+
+## Directory Structure
+
+```
+local_testing/
+├── main.py              # Main orchestration (265 lines)
+├── cli.py               # Command-line interface (121 lines)
+├── file_downloader.py   # Downloads GML files (95 lines)
+├── gml_extractor.py     # ZIP extraction (50 lines)
+├── gml_converter.py     # GML conversion (458 lines)
+├── pipeline_config.py   # Config management (93 lines)
+├── pipeline_runner.py   # 26-phase transformation (254 lines)
+├── pipeline_report.py   # Performance tracking (346 lines)
+├── polars_phases.py     # Polars-optimized phases (767 lines)
+├── Makefile             # Make commands for easy setup and running
+├── README.md            # This file
+├── .gitignore           # Git ignore file
+├── venv/                # Virtual environment (created with: make init)
+├── raw/                 # Downloaded ZIP files
+├── extracted/           # Extracted GML files
+├── converted/           # GML converted to CSV/Parquet
+├── output/              # Pipeline output (harmonised + facts)
+├── reports/             # Performance reports
+├── cache/               # Organisation.csv cache
+├── pipeline/            # Pipeline configuration CSVs
+├── specification/       # digital-land specification files
+└── scripts/             # Helper scripts
+```
+
+## Module Overview
+
+### CLI (`cli.py`)
+- Argument parsing with `argparse`
+- Fetches endpoint list from GitHub
+- Lists and matches data items
+- Clean separation of UI logic
+
+### FileDownloader (`file_downloader.py`)
+- Downloads files from APIs
+- Progress tracking with byte counts
+- Reusable for any file download needs
+
+### GMLExtractor (`gml_extractor.py`)
+- Extracts GML files from ZIP archives
+- Handles nested directory structures
+- Simple, focused responsibility
+
+### GMLConverter (`gml_converter.py`)
+- **4 conversion strategies**:
+  1. Regex → CSV (default, no dependencies)
+  2. Regex → Parquet (Polars)
+  3. DuckDB → CSV (spatial extension)
+  4. DuckDB → Parquet (fastest, best)
+- Parses GML polygons to WKT
+- Handles coordinate transformation
+
+### PipelineConfig (`pipeline_config.py`)
+- Creates pipeline configuration CSVs
+- Downloads organisation.csv
+- Ensures all config files exist
+
+### PipelineRunner (`pipeline_runner.py`)
+- Executes 26-phase digital-land pipeline
+- Lazy imports for fast startup
+- Per-phase timing and metrics
+- Handles Parquet/CSV input
+
+### PipelineReport (`pipeline_report.py`)
+- Tracks step and phase metrics
+- Generates JSON and text reports
+- Calculates durations and throughput
+- Supports comparison reporting
+
+## Output Files
+
+After running the pipeline, you will find:
+
+**Pipeline Output:**
+- `output/{name}_harmonised.csv` - Intermediate harmonised data
+- `output/{name}_facts.csv` - Final fact table output
+- `output/{name}_issues.csv` - Any issues logged during processing
+
+**Performance Reports:**
+
+1. **Single LA Report** (default)
+ - `reports/{name}_{timestamp}_performance.json` - Detailed JSON report
+ - `reports/{name}_{timestamp}_performance.txt` - Human-readable text report
+ - Shows timing for all 26 phases
+
+2. **Selective Phase Report** (when using `--phases`)
+ - Same format as above
+ - Only includes metrics for selected phases
+ - Useful for testing specific transformations
+
+3. **Batch Summary Report** (when using `make run-all`)
+ - `reports/batch_{timestamp}_summary.json` - Aggregate metrics for entire batch
+ - Includes total time, per-LA timing, success/error counts
+ - Shows min/max/average processing times across all LAs
+ - **All run commands now include automatic Polars comparison** (both Original + Polars pipelines)
+
+## Command Line Options
+
+| Option | Description |
+|--------|-------------|
+| `--la NAME` | Item name (partial match) |
+| `--limit N` | Limit number of records to process |
+| `--skip-download` | Use existing downloaded data |
+| `--list` | List all available items |
+| `--use-duckdb` | Use DuckDB with spatial extension for GML conversion (faster, proper CRS transform) |
+| `--use-parquet` | Output Parquet instead of CSV (faster reads, smaller files) |
+| `--phases` | Run specific phases (e.g., `1,2,9` or `1-5,9`) |
+| `--compare` | Run both original and Polars pipelines for comparison (enabled by default in Makefile) |
+
+## GML Conversion Methods
+
+The **GMLConverter** class supports multiple conversion strategies:
+
+### Output Formats
+
+| Format | Flag | Advantages |
+|--------|------|------------|
+| **CSV** | (default) | Universal, human-readable |
+| **Parquet** | `--use-parquet` | 3-10x smaller, faster reads, preserves types |
+
+### Conversion Engines
+
+| Engine | Flag | Speed | Features |
+|--------|------|-------|----------|
+| **Regex** | (default) | Slow | No dependencies |
+| **DuckDB** | `--use-duckdb` | Fast | Proper CRS transform, spatial extension |
+
+### Best Performance
+
+For the fastest conversion, use DuckDB with Parquet output:
+
+```bash
+# Best performance: DuckDB → Parquet
+python main.py --la "Buckinghamshire" --use-duckdb --use-parquet
+```
+
+## Testing the Modular Architecture
+
+All classes are independently testable:
+
+```bash
+# Navigate to directory
+cd digital-land-python/local_testing
+
+# Activate venv
+source venv/bin/activate
+
+# Verify all modules work
+python3 -c "
+from cli import CLI
+from file_downloader import FileDownloader
+from gml_extractor import GMLExtractor
+from gml_converter import GMLConverter
+from pipeline_config import PipelineConfig
+from pipeline_runner import PipelineRunner
+from pipeline_report import PipelineReport
+print('✅ All modules imported successfully')
+"
+```
+
+## Notes
+
+- Virtual environment should be created in `local_testing/venv/`
+- Entity assignment requires a lookup table (`pipeline/lookup.csv`)
+- Without lookups, harmonised data will have empty entity field
+- Facts output will be empty without entity lookups
+- Coordinates are converted from OSGB (EPSG:27700) to WGS84
+- Requirements: `pip install polars duckdb`
+- Parquet uses Snappy compression by default
+- Add `venv/` to `.gitignore` to avoid committing virtual environment
+- **Reusable for other datasets** - Just update the endpoint URL in CLI or main.py
+
+## Development
+
+Each module can be modified independently:
+
+- **Add new conversion method**: Edit `GMLConverter.convert_to_*()` methods
+- **Change CLI options**: Edit `CLI.create_parser()`
+- **Add new pipeline phases**: Edit `PipelineRunner.run_full_pipeline()`
+- **Modify reporting**: Edit `PipelineReport` metrics and output formats
+- **Add new data sources**: Create new downloader classes following `FileDownloader` pattern
+- **Adapt for new datasets**: Update endpoint URLs and field mappings in relevant classes
+
+The modular structure makes it easy to extend and test each component in isolation.
diff --git a/local_testing/pipeline/column.csv b/local_testing/pipeline/column.csv
new file mode 100644
index 00000000..ee2a9062
--- /dev/null
+++ b/local_testing/pipeline/column.csv
@@ -0,0 +1,11 @@
+dataset,resource,column,field
+title-boundary,,reference,reference
+title-boundary,,name,name
+title-boundary,,geometry,geometry
+title-boundary,,start-date,start-date
+title-boundary,,entry-date,entry-date
+title-boundary,,end-date,end-date
+title-boundary,,prefix,prefix
+title-boundary,,organisation,organisation
+title-boundary,,notes,notes
+title-boundary,,national-cadastral-reference,notes
diff --git a/local_testing/pipeline/combine.csv b/local_testing/pipeline/combine.csv
new file mode 100644
index 00000000..cae5fefa
--- /dev/null
+++ b/local_testing/pipeline/combine.csv
@@ -0,0 +1 @@
+dataset,resource,field,fields,separator
diff --git a/local_testing/pipeline/concat.csv b/local_testing/pipeline/concat.csv
new file mode 100644
index 00000000..cae5fefa
--- /dev/null
+++ b/local_testing/pipeline/concat.csv
@@ -0,0 +1 @@
+dataset,resource,field,fields,separator
diff --git a/local_testing/pipeline/convert.csv b/local_testing/pipeline/convert.csv
new file mode 100644
index 00000000..926bf51e
--- /dev/null
+++ b/local_testing/pipeline/convert.csv
@@ -0,0 +1 @@
+dataset,resource,field,value,replacement
diff --git a/local_testing/pipeline/default.csv b/local_testing/pipeline/default.csv
new file mode 100644
index 00000000..8f30d573
--- /dev/null
+++ b/local_testing/pipeline/default.csv
@@ -0,0 +1,2 @@
+dataset,resource,field,default-field,entry-date
+title-boundary,,,entry-date,
diff --git a/local_testing/pipeline/filter.csv b/local_testing/pipeline/filter.csv
new file mode 100644
index 00000000..a9802699
--- /dev/null
+++ b/local_testing/pipeline/filter.csv
@@ -0,0 +1 @@
+dataset,resource,field,pattern
diff --git a/local_testing/pipeline/lookup.csv b/local_testing/pipeline/lookup.csv
new file mode 100644
index 00000000..fee3f4f0
--- /dev/null
+++ b/local_testing/pipeline/lookup.csv
@@ -0,0 +1,11 @@
+prefix,resource,organisation,reference,entity
+title-boundary,,,33205373,12000000001
+title-boundary,,,60898175,12000000002
+title-boundary,,,33209075,12000000003
+title-boundary,,,55955680,12000000004
+title-boundary,,,37316451,12000000005
+title-boundary,,,26291037,12000000006
+title-boundary,,,30556652,12000000007
+title-boundary,,,42046003,12000000008
+title-boundary,,,32896399,12000000009
+title-boundary,,,42173303,12000000010
diff --git a/local_testing/pipeline/migrate.csv b/local_testing/pipeline/migrate.csv
new file mode 100644
index 00000000..728e7bbc
--- /dev/null
+++ b/local_testing/pipeline/migrate.csv
@@ -0,0 +1 @@
+dataset,old-field,new-field
diff --git a/local_testing/pipeline/patch.csv b/local_testing/pipeline/patch.csv
new file mode 100644
index 00000000..478c396a
--- /dev/null
+++ b/local_testing/pipeline/patch.csv
@@ -0,0 +1 @@
+dataset,resource,field,pattern,value
diff --git a/local_testing/pipeline/redirect.csv b/local_testing/pipeline/redirect.csv
new file mode 100644
index 00000000..d3d9f670
--- /dev/null
+++ b/local_testing/pipeline/redirect.csv
@@ -0,0 +1 @@
+entity,status,redirect-entity
diff --git a/local_testing/pipeline/skip.csv b/local_testing/pipeline/skip.csv
new file mode 100644
index 00000000..d5f3eaff
--- /dev/null
+++ b/local_testing/pipeline/skip.csv
@@ -0,0 +1 @@
+dataset,resource,pattern
From 1ef4f09cca398bb8b1856d58cb65d30780670d4b Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Fri, 6 Feb 2026 11:43:56 +0000
Subject: [PATCH 11/15] =?UTF-8?q?Refactor=20pipeline=20scripts:=20remove?=
=?UTF-8?q?=20old=20main.py,=20pipeline=5Freport.py,=20and=20pipeline=5Fru?=
=?UTF-8?q?nner.py;=20add=20new=20implementations=20for=20main=20pipeline?=
=?UTF-8?q?=20orchestration=20and=20reporting=20Rapid=20local=20performanc?=
=?UTF-8?q?e=20test=20environment=20supporting=20the=20Polars=E2=80=91base?=
=?UTF-8?q?d=20transformation=20rewrite=20in=20digital-land-python=20Fixes?=
=?UTF-8?q?=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 4 -
local_testing/main.py | 365 +++++++++++++++++++++++
local_testing/pipeline_report.py | 477 +++++++++++++++++++++++++++++++
local_testing/pipeline_runner.py | 444 ++++++++++++++++++++++++++++
4 files changed, 1286 insertions(+), 4 deletions(-)
create mode 100644 local_testing/main.py
create mode 100644 local_testing/pipeline_report.py
create mode 100644 local_testing/pipeline_runner.py
diff --git a/.gitignore b/.gitignore
index d65ec89a..578c0446 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,7 +49,3 @@ notebooks/.ipynb_checkpoints
/local_testing/specification/
/local_testing/venv/
-/local_testing/main.py
-/local_testing/pipeline_report.py
-/local_testing/pipeline_runner.py
-
diff --git a/local_testing/main.py b/local_testing/main.py
new file mode 100644
index 00000000..34b5c586
--- /dev/null
+++ b/local_testing/main.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python3
+"""
+Title Boundary Pipeline - Download, Convert, and Transform GML data from Land Registry
+
+Orchestration script that coordinates multiple specialized classes.
+"""
+
+import sys
+import time
+from pathlib import Path
+from datetime import datetime
+
+from cli import CLI
+from file_downloader import FileDownloader
+from gml_extractor import GMLExtractor
+from gml_converter import GMLConverter
+from pipeline_config import PipelineConfig
+from pipeline_runner import PipelineRunner
+from pipeline_report import PipelineReport
+
+
+# =============================================================================
+# Constants
+# =============================================================================
+
+SCRIPT_DIR = Path(__file__).parent.resolve()
+DATASET = "title-boundary"
+
+
+# =============================================================================
+# Helper Functions
+# =============================================================================
+
+
+def parse_phase_selection(phases_str: str) -> set:
+ """
+ Parse phase selection string into set of phase numbers.
+
+ Args:
+ phases_str: Comma-separated phase numbers or ranges (e.g., "1,2,9" or "1-5,9")
+
+ Returns:
+ Set of selected phase numbers, or None if invalid
+ """
+ phases = set()
+ try:
+ for part in phases_str.split(","):
+ part = part.strip()
+ if "-" in part:
+ # Range: "1-5"
+ start, end = part.split("-")
+ phases.update(range(int(start), int(end) + 1))
+ else:
+ # Single phase: "9"
+ phases.add(int(part))
+
+ # Validate phase numbers (1-26)
+ if any(p < 1 or p > 26 for p in phases):
+ return None
+
+ return phases
+ except (ValueError, AttributeError):
+ return None
+
+
+# =============================================================================
+# Main Entry Point
+# =============================================================================
+
+
+def main():
+ """Main entry point for title-boundary pipeline."""
+
+ # Parse arguments using CLI class
+ parser = CLI.create_parser()
+ args = parser.parse_args()
+
+ # Setup directories
+ raw_dir = SCRIPT_DIR / "raw"
+ extracted_dir = SCRIPT_DIR / "extracted"
+ converted_dir = SCRIPT_DIR / "converted"
+ output_dir = SCRIPT_DIR / "output"
+ pipeline_dir = SCRIPT_DIR / "pipeline"
+ specification_dir = SCRIPT_DIR.parent / "specification"
+ cache_dir = SCRIPT_DIR / "cache"
+ reports_dir = SCRIPT_DIR / "reports"
+
+ for directory in [
+ raw_dir,
+ extracted_dir,
+ converted_dir,
+ output_dir,
+ pipeline_dir,
+ cache_dir,
+ reports_dir,
+ ]:
+ directory.mkdir(parents=True, exist_ok=True)
+
+ # List mode - use CLI class
+ if args.list or not args.la:
+ CLI.list_available_las()
+ if not args.la:
+ print("Use --la 'Name' to process a specific Local Authority")
+ return 0
+
+ # Find matching LA - use CLI class
+ endpoint, la_name = CLI.find_matching_la(args.la)
+ if not endpoint:
+ return 1
+
+ # Initialize
+ la_slug = la_name.lower().replace(" ", "_").replace(",", "")
+ report = PipelineReport()
+ report.local_authority = la_name
+ report.dataset = DATASET
+
+ # Print header
+ print(f"\n{'='*60}")
+ print("Title Boundary Pipeline")
+ print(f"{'='*60}")
+ print(f"Local Authority: {la_name}")
+ print(f"Endpoint: {endpoint['url']}")
+ if args.limit:
+ print(f"Limit: {args.limit:,} records")
+ print(f"{'='*60}\n")
+
+ overall_start = time.time()
+
+ # =========================================================================
+ # Step 1: Download - use FileDownloader class
+ # =========================================================================
+ print("Step 1: Download")
+ print("-" * 40)
+
+ step_download = report.add_step("Download")
+ zip_path = raw_dir / f"{la_slug}.zip"
+
+ if args.skip_download and zip_path.exists():
+ print(f" Using existing: {zip_path}")
+ step_download.mark_complete(success=True)
+ else:
+ downloader = FileDownloader()
+ success = downloader.download_file(endpoint["url"], zip_path)
+ step_download.mark_complete(success=success)
+ if not success:
+ print(" Download failed")
+ return 1
+
+ if zip_path.exists():
+ report.zip_size_mb = zip_path.stat().st_size / (1024 * 1024)
+
+ # =========================================================================
+ # Step 2: Extract - use GMLExtractor class
+ # =========================================================================
+ print("\nStep 2: Extract")
+ print("-" * 40)
+
+ step_extract = report.add_step("Extract")
+ extract_subdir = extracted_dir / la_slug
+
+ try:
+ gml_path = GMLExtractor.extract_gml_from_zip(zip_path, extract_subdir)
+ step_extract.mark_complete(success=True)
+
+ if gml_path.exists():
+ report.gml_size_mb = gml_path.stat().st_size / (1024 * 1024)
+ except Exception as e:
+ print(f" Extraction failed: {e}")
+ step_extract.mark_complete(success=False)
+ return 1
+
+ # =========================================================================
+ # Step 3: Convert - use GMLConverter class
+ # =========================================================================
+ output_format = "Parquet" if args.use_parquet else "CSV"
+ print(f"\nStep 3: Convert GML to {output_format}")
+ print("-" * 40)
+
+ step_convert = report.add_step("Convert")
+ converter = GMLConverter()
+
+ # Choose conversion method based on arguments
+ if args.use_duckdb and args.use_parquet:
+ method = "DuckDB+Parquet"
+ output_path = converted_dir / f"{la_slug}.parquet"
+ record_count = converter.convert_to_parquet_duckdb(
+ gml_path, output_path, limit=args.limit
+ )
+ elif args.use_duckdb:
+ method = "DuckDB+CSV"
+ output_path = converted_dir / f"{la_slug}.csv"
+ record_count = converter.convert_to_csv_duckdb(
+ gml_path, output_path, limit=args.limit
+ )
+ elif args.use_parquet:
+ method = "Polars+Parquet"
+ output_path = converted_dir / f"{la_slug}.parquet"
+ record_count = converter.convert_to_parquet(
+ gml_path, output_path, limit=args.limit
+ )
+ else:
+ method = "Polars+CSV"
+ output_path = converted_dir / f"{la_slug}.csv"
+ record_count = converter.convert_to_csv(gml_path, output_path, limit=args.limit)
+
+ step_convert.mark_complete(
+ success=record_count > 0, record_count=record_count, method=method
+ )
+
+ if record_count == 0:
+ print(" Conversion produced no records")
+ return 1
+
+ report.input_records = record_count
+
+ # =========================================================================
+ # Step 4: Transform - use PipelineConfig and PipelineRunner classes
+ # =========================================================================
+ print("\nStep 4: Transform through Pipeline")
+ print("-" * 40)
+
+ step_transform = report.add_step("Transform")
+
+ # Ensure configuration exists using PipelineConfig class
+ PipelineConfig.ensure_pipeline_config(pipeline_dir)
+
+ if not specification_dir.exists():
+ print(f" Error: Specification directory not found: {specification_dir}")
+ print(f" Please clone specification to: {specification_dir}")
+ step_transform.mark_complete(success=False)
+ return 1
+
+ # Parse phase selection if provided
+ selected_phases = None
+ if args.phases:
+ selected_phases = parse_phase_selection(args.phases)
+ if selected_phases:
+ print(f" Running selected phases: {sorted(selected_phases)}")
+ report.selected_phases = selected_phases # Store in report for filtering
+ else:
+ print(f" Invalid phase selection: {args.phases}")
+ step_transform.mark_complete(success=False)
+ return 1
+
+ # Run pipeline using PipelineRunner class
+ runner = PipelineRunner(dataset=DATASET)
+ results = runner.run_full_pipeline(
+ input_csv=output_path,
+ output_dir=output_dir,
+ specification_dir=specification_dir,
+ pipeline_dir=pipeline_dir,
+ cache_dir=cache_dir,
+ la_name=la_name,
+ report=report,
+ selected_phases=selected_phases,
+ )
+
+ step_transform.mark_complete(
+ success=True,
+ harmonised_records=results["harmonised"],
+ fact_records=results["facts"],
+ transform_time=results.get("transform_time", 0),
+ )
+
+ # Run Polars pipeline for comparison if requested
+ if args.compare:
+ print("\n Running Polars pipeline for comparison...")
+ from polars_phases import run_polars_pipeline, PolarsPhaseMetrics
+
+ # Define required parameters
+ field_datatype_map = {"geometry": "text"} # Simplified for now
+ intermediate_fieldnames = ["entity", "name", "geometry", "organisation"]
+ factor_fieldnames = ["entity", "fact"]
+
+ polars_harmonised = output_dir / f"{la_name}_polars_harmonised.csv"
+ polars_facts = output_dir / f"{la_name}_polars_facts.csv"
+
+ polars_start = time.time()
+ polars_metrics, polars_harm_count, polars_fact_count = run_polars_pipeline(
+ input_csv=output_path,
+ harmonised_csv=polars_harmonised,
+ facts_csv=polars_facts,
+ field_datatype_map=field_datatype_map,
+ intermediate_fieldnames=intermediate_fieldnames,
+ factor_fieldnames=factor_fieldnames,
+ dataset=DATASET,
+ selected_phases=selected_phases, # Pass phase selection to Polars
+ )
+ polars_end = time.time()
+
+ # Store Polars metrics in report
+ report.polars_phases = []
+ for metric in polars_metrics:
+ from pipeline_report import PhaseMetrics
+ phase_metric = PhaseMetrics(
+ name=metric.name,
+ phase_number=metric.phase_number,
+ start_time=0,
+ end_time=0,
+ duration_seconds=metric.duration_seconds,
+ input_count=metric.input_count,
+ output_count=metric.output_count,
+ )
+ report.polars_phases.append(phase_metric)
+
+ report.polars_harmonised_records = polars_harm_count
+ report.polars_fact_records = polars_fact_count
+ report.polars_transform_seconds = polars_end - polars_start
+
+ speedup = results.get("transform_time", 0) / report.polars_transform_seconds if report.polars_transform_seconds > 0 else 0
+ print(f" Polars transform time: {report.polars_transform_seconds:.3f}s")
+ print(f" Speedup: {speedup:.1f}x faster")
+
+ # =========================================================================
+ # Step 5: Generate Report - use PipelineReport class
+ # =========================================================================
+ overall_end = time.time()
+ report.total_duration_seconds = overall_end - overall_start
+ report.calculate_totals()
+
+ print("\nStep 5: Generate Performance Report")
+ print("-" * 40)
+
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ json_path = reports_dir / f"{la_slug}_{timestamp}_performance.json"
+ text_path = reports_dir / f"{la_slug}_{timestamp}_performance.txt"
+
+ report.save_json(json_path)
+ report.save_text(text_path)
+
+ print(f" JSON report: {json_path}")
+ print(f" Text report: {text_path}")
+
+ # =========================================================================
+ # Summary
+ # =========================================================================
+ print(f"\n{'='*60}")
+ print("PIPELINE COMPLETE")
+ print(f"{'='*60}")
+ print(f"Local Authority: {la_name}")
+ print(f"Dataset: {DATASET}")
+ print(f"Total Duration: {report.total_duration_seconds:.2f}s")
+ print(f"Input Records: {report.input_records:,}")
+ print(f"Harmonised Records: {report.harmonised_records:,}")
+ print(f"Fact Records: {report.fact_records:,}")
+
+ if report.steps:
+ print(f"\nStep Summary:")
+ for name, step in report.steps.items():
+        status = "✓" if step.success else "✗"
+ print(f" {status} {name:<20} {step.duration_seconds:8.3f}s")
+
+ if report.phases:
+ total_phase_time = sum(p.duration_seconds for p in report.phases)
+ print(
+ f"\nTransform Phases: {len(report.phases)} phases, {total_phase_time:.3f}s total"
+ )
+
+ print(f"{'='*60}\n")
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main() or 0)
diff --git a/local_testing/pipeline_report.py b/local_testing/pipeline_report.py
new file mode 100644
index 00000000..cb4b9c26
--- /dev/null
+++ b/local_testing/pipeline_report.py
@@ -0,0 +1,477 @@
+"""
+Performance reporting and metrics tracking for pipeline runs.
+
+Provides classes to track timing, resource usage, and comparison
+metrics for original vs Polars pipeline implementations.
+"""
+
+import sys
+import time
+import platform as plat
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, field
+
+
+@dataclass
+class PhaseMetrics:
+ """Metrics for a single pipeline phase."""
+
+ name: str
+ phase_number: int
+ start_time: float = 0.0
+ end_time: float = 0.0
+ duration_seconds: float = 0.0
+ input_count: int = 0
+ output_count: int = 0
+
+ def complete(self, output_count: int = 0):
+ """Mark phase as complete and calculate duration."""
+ self.end_time = time.time()
+ self.duration_seconds = self.end_time - self.start_time
+ self.output_count = output_count
+
+
+@dataclass
+class StepMetrics:
+ """Metrics for a pipeline step (Download, Extract, Convert, Transform)."""
+
+ name: str
+ start_time: float = 0.0
+ end_time: float = 0.0
+ duration_seconds: float = 0.0
+ success: bool = True
+ details: Dict[str, Any] = field(default_factory=dict)
+
+ def start(self):
+ """Start timing this step."""
+ self.start_time = time.time()
+
+ def complete(self, **details):
+ """Mark step as complete."""
+ self.end_time = time.time()
+ self.duration_seconds = self.end_time - self.start_time
+ self.details.update(details)
+
+ def mark_complete(self, success: bool = True, **details):
+ """Mark step as complete with success status."""
+ self.end_time = time.time()
+ self.duration_seconds = self.end_time - self.start_time
+ self.success = success
+ self.details.update(details)
+
+
+@dataclass
+class PipelineReport:
+ """Complete performance report for a pipeline run."""
+
+ # Run metadata
+ run_id: str = ""
+ timestamp: str = ""
+ local_authority: str = ""
+ dataset: str = "title-boundary"
+ record_limit: Optional[int] = None
+
+ # Input/Output metrics
+ input_records: int = 0
+ harmonised_records: int = 0
+ fact_records: int = 0
+
+ # Polars comparison metrics
+ polars_harmonised_records: int = 0
+ polars_fact_records: int = 0
+ polars_phases: List[PhaseMetrics] = field(default_factory=list)
+ polars_transform_seconds: float = 0.0
+
+ # File sizes
+ zip_size_mb: float = 0.0
+ gml_size_mb: float = 0.0
+ csv_size_mb: float = 0.0
+
+ # Step timings
+ steps: Dict[str, StepMetrics] = field(default_factory=dict)
+
+ # Phase timings (transformation only)
+ phases: List[PhaseMetrics] = field(default_factory=list)
+
+ # Phase selection (if running specific phases)
+ selected_phases: Optional[set] = None
+
+ # Total timing
+ total_duration_seconds: float = 0.0
+ transform_duration_seconds: float = 0.0
+
+ # System info
+ python_version: str = ""
+ platform: str = ""
+
+ def __post_init__(self):
+ """Initialize run metadata."""
+ self.python_version = sys.version.split()[0]
+ self.platform = f"{plat.system()} {plat.release()}"
+ self.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
+ self.timestamp = datetime.now().isoformat()
+
+ def add_step(self, name: str) -> StepMetrics:
+ """Add and start a new step."""
+ step = StepMetrics(name=name)
+ step.start()
+ self.steps[name] = step
+ return step
+
+ def add_phase(self, name: str, phase_number: int) -> PhaseMetrics:
+ """Add a new phase."""
+ phase = PhaseMetrics(
+ name=name, phase_number=phase_number, start_time=time.time()
+ )
+ self.phases.append(phase)
+ return phase
+
+ def calculate_totals(self):
+ """Calculate total durations."""
+ self.total_duration_seconds = sum(
+ s.duration_seconds for s in self.steps.values()
+ )
+ self.transform_duration_seconds = sum(p.duration_seconds for p in self.phases)
+
+ def to_dict(self) -> Dict:
+ """Convert to dictionary for JSON serialization."""
+ # Filter phases if selection is active
+ phases_to_output = self.phases
+ polars_phases_to_output = self.polars_phases
+ if self.selected_phases:
+ phases_to_output = [p for p in self.phases if p.phase_number in self.selected_phases]
+ polars_phases_to_output = [p for p in self.polars_phases if p.phase_number in self.selected_phases]
+
+ return {
+ "run_id": self.run_id,
+ "timestamp": self.timestamp,
+ "local_authority": self.local_authority,
+ "dataset": self.dataset,
+ "record_limit": self.record_limit,
+ "selected_phases": list(sorted(self.selected_phases)) if self.selected_phases else None,
+ "input_records": self.input_records,
+ "harmonised_records": self.harmonised_records,
+ "fact_records": self.fact_records,
+ "file_sizes": {
+ "zip_mb": self.zip_size_mb,
+ "gml_mb": self.gml_size_mb,
+ "csv_mb": self.csv_size_mb,
+ },
+ "timing": {
+ "total_seconds": self.total_duration_seconds,
+ "transform_seconds": self.transform_duration_seconds,
+ "polars_transform_seconds": self.polars_transform_seconds,
+ "speedup_factor": (
+ (self.transform_duration_seconds / self.polars_transform_seconds)
+ if self.polars_transform_seconds > 0
+ else 0
+ ),
+ "steps": {
+ name: {"duration_seconds": s.duration_seconds, **s.details}
+ for name, s in self.steps.items()
+ },
+ "phases": [
+ {
+ "number": p.phase_number,
+ "name": p.name,
+ "duration_seconds": p.duration_seconds,
+ "output_count": p.output_count,
+ }
+ for p in phases_to_output
+ ],
+ "polars_phases": [
+ {
+ "number": p.phase_number,
+ "name": p.name,
+ "duration_seconds": p.duration_seconds,
+ "output_count": p.output_count,
+ }
+ for p in polars_phases_to_output
+ ],
+ },
+ "comparison": {
+ "original_transform_seconds": self.transform_duration_seconds,
+ "polars_transform_seconds": self.polars_transform_seconds,
+ "speedup_factor": (
+ (self.transform_duration_seconds / self.polars_transform_seconds)
+ if self.polars_transform_seconds > 0
+ else 0
+ ),
+ "time_saved_seconds": self.transform_duration_seconds
+ - self.polars_transform_seconds,
+ },
+ "system": {
+ "python_version": self.python_version,
+ "platform": self.platform,
+ },
+ }
+
+ def generate_text_report(self) -> str:
+ """Generate human-readable text report."""
+ lines = []
+ lines.append("=" * 100)
+ lines.append("TITLE BOUNDARY PIPELINE - PERFORMANCE REPORT")
+ lines.append("=" * 100)
+ lines.append("")
+ lines.append(f"Run ID: {self.run_id}")
+ lines.append(f"Timestamp: {self.timestamp}")
+ lines.append(f"Local Authority: {self.local_authority}")
+ lines.append(f"Dataset: {self.dataset}")
+ lines.append(f"Record Limit: {self.record_limit or 'None (all records)'}")
+ lines.append("")
+
+ lines.append("-" * 100)
+ lines.append("INPUT/OUTPUT SUMMARY")
+ lines.append("-" * 100)
+ lines.append(f"Input Records: {self.input_records:,}")
+ if self.polars_phases:
+ lines.append(
+ f"Harmonised Records: {self.harmonised_records:,} (Original) / {self.polars_harmonised_records:,} (Polars)"
+ )
+ lines.append(
+ f"Fact Records: {self.fact_records:,} (Original) / {self.polars_fact_records:,} (Polars)"
+ )
+ else:
+ lines.append(f"Harmonised Records: {self.harmonised_records:,}")
+ lines.append(f"Fact Records: {self.fact_records:,}")
+ lines.append("")
+
+ lines.append("-" * 100)
+ lines.append("FILE SIZES")
+ lines.append("-" * 100)
+ lines.append(f"ZIP File: {self.zip_size_mb:,.2f} MB")
+ lines.append(f"GML File: {self.gml_size_mb:,.2f} MB")
+ lines.append(f"CSV File: {self.csv_size_mb:,.2f} MB")
+ lines.append("")
+
+ lines.append("-" * 100)
+ lines.append("STEP TIMING SUMMARY")
+ lines.append("-" * 100)
+ lines.append(f"{'Step':<20} {'Duration':>12} {'% of Total':>12}")
+ lines.append("-" * 44)
+ for name, step in self.steps.items():
+ pct = (
+ (step.duration_seconds / self.total_duration_seconds * 100)
+ if self.total_duration_seconds > 0
+ else 0
+ )
+ lines.append(f"{name:<20} {step.duration_seconds:>10.3f}s {pct:>10.1f}%")
+ lines.append("-" * 44)
+ lines.append(
+ f"{'TOTAL':<20} {self.total_duration_seconds:>10.3f}s {100.0:>10.1f}%"
+ )
+ lines.append("")
+
+ # COMBINED PHASE COMPARISON TABLE (if Polars was run)
+ if self.polars_phases:
+ lines.append("=" * 100)
+ lines.append("PHASE-BY-PHASE COMPARISON: ORIGINAL vs POLARS")
+ lines.append("=" * 100)
+
+ # Show phase selection info if applicable
+ if self.selected_phases:
+ lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
+ lines.append("")
+
+ # Header
+ lines.append(
+ f"{'#':<3} {'Phase Name':<26} {'Original':>11} {'Polars':>11} {'Speedup':>10} {'Time Saved':>12} {'Orig Out':>10} {'Polars Out':>10}"
+ )
+ lines.append("-" * 100)
+
+ # Build lookup for Polars phases by name
+ polars_by_name = {p.name: p for p in self.polars_phases}
+
+ # Filter phases if selection is active
+ phases_to_display = self.phases
+ if self.selected_phases:
+ phases_to_display = [p for p in self.phases if p.phase_number in self.selected_phases]
+
+ total_original = 0.0
+ total_polars = 0.0
+ total_saved = 0.0
+
+ for phase in phases_to_display:
+ polars_phase = polars_by_name.get(phase.name)
+ if polars_phase:
+ if polars_phase.duration_seconds > 0:
+ speedup = phase.duration_seconds / polars_phase.duration_seconds
+ else:
+ speedup = float("inf") if phase.duration_seconds > 0 else 1.0
+
+ saved = phase.duration_seconds - polars_phase.duration_seconds
+                    speedup_str = f"{speedup:.1f}x" if speedup != float("inf") else "∞"
+
+ lines.append(
+ f"{phase.phase_number:<3} {phase.name:<26} {phase.duration_seconds:>9.4f}s {polars_phase.duration_seconds:>9.4f}s {speedup_str:>9} {saved:>10.4f}s {phase.output_count:>10,} {polars_phase.output_count:>10,}"
+ )
+
+ total_original += phase.duration_seconds
+ total_polars += polars_phase.duration_seconds
+ total_saved += saved
+ else:
+ lines.append(
+ f"{phase.phase_number:<3} {phase.name:<26} {phase.duration_seconds:>9.4f}s {'N/A':>11} {'N/A':>9} {'N/A':>12} {phase.output_count:>10,} {'N/A':>10}"
+ )
+
+ lines.append("-" * 100)
+ overall_speedup = total_original / total_polars if total_polars > 0 else 0
+ lines.append(
+ f"{'':3} {'TOTAL TRANSFORM TIME':<26} {total_original:>9.4f}s {total_polars:>9.4f}s {overall_speedup:>8.1f}x {total_saved:>10.4f}s"
+ )
+ lines.append("")
+
+ # Overall summary
+ lines.append("-" * 100)
+ lines.append("PERFORMANCE SUMMARY")
+ lines.append("-" * 100)
+ lines.append(f"Original Pipeline: {total_original:.4f}s")
+ lines.append(f"Polars Pipeline: {total_polars:.4f}s")
+ lines.append(f"Speedup Factor: {overall_speedup:.1f}x faster")
+ lines.append(
+ f"Time Saved: {total_saved:.4f}s ({(total_saved/total_original*100):.1f}% reduction)"
+ )
+ lines.append("")
+
+ else:
+ lines.append("-" * 100)
+ lines.append("ORIGINAL PIPELINE - PHASE TIMING (Row-by-Row)")
+ lines.append("-" * 100)
+
+ # Show phase selection info if applicable
+ if self.selected_phases:
+ lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
+ lines.append("")
+
+ lines.append(
+ f"{'#':<4} {'Phase Name':<30} {'Duration':>12} {'% of Transform':>14} {'Output':>10}"
+ )
+ lines.append("-" * 74)
+
+ # Filter phases if selection is active
+ phases_to_display = self.phases
+ if self.selected_phases:
+ phases_to_display = [p for p in self.phases if p.phase_number in self.selected_phases]
+
+ for phase in phases_to_display:
+ pct = (
+ (phase.duration_seconds / self.transform_duration_seconds * 100)
+ if self.transform_duration_seconds > 0
+ else 0
+ )
+ lines.append(
+ f"{phase.phase_number:<4} {phase.name:<30} {phase.duration_seconds:>10.4f}s {pct:>12.1f}% {phase.output_count:>10,}"
+ )
+
+ lines.append("-" * 74)
+ lines.append(
+ f"{'':4} {'TOTAL TRANSFORM TIME':<30} {self.transform_duration_seconds:>10.4f}s {100.0:>12.1f}%"
+ )
+ lines.append("")
+
+ # Top 5 slowest phases (Original)
+ lines.append("-" * 100)
+ lines.append("TOP 5 SLOWEST PHASES (Original Pipeline)")
+ lines.append("-" * 100)
+
+ # Filter phases for "top slowest" if selection is active
+ phases_for_top5 = self.phases
+ if self.selected_phases:
+ phases_for_top5 = [p for p in self.phases if p.phase_number in self.selected_phases]
+
+ sorted_phases = sorted(
+ phases_for_top5, key=lambda x: x.duration_seconds, reverse=True
+ )[:5]
+ for i, phase in enumerate(sorted_phases, 1):
+ pct = (
+ (phase.duration_seconds / self.transform_duration_seconds * 100)
+ if self.transform_duration_seconds > 0
+ else 0
+ )
+ lines.append(
+ f" {i}. {phase.name:<30} {phase.duration_seconds:>10.4f}s ({pct:.1f}%)"
+ )
+ lines.append("")
+
+ # TOP SPEEDUP WINNERS (if Polars was run)
+ if self.polars_phases:
+ lines.append("-" * 100)
+ lines.append("TOP 5 SPEEDUP WINNERS (Biggest Improvements with Polars)")
+ lines.append("-" * 100)
+
+ # Filter phases for speedup calculation if selection is active
+ phases_for_speedup = self.phases
+ if self.selected_phases:
+ phases_for_speedup = [p for p in self.phases if p.phase_number in self.selected_phases]
+
+ polars_by_name = {p.name: p for p in self.polars_phases}
+ speedups = []
+ for phase in phases_for_speedup:
+ polars_phase = polars_by_name.get(phase.name)
+ if polars_phase and phase.duration_seconds > 0.0001:
+ if polars_phase.duration_seconds > 0:
+ speedup = phase.duration_seconds / polars_phase.duration_seconds
+ else:
+ speedup = float("inf")
+ saved = phase.duration_seconds - polars_phase.duration_seconds
+ speedups.append(
+ (
+ phase.name,
+ phase.duration_seconds,
+ polars_phase.duration_seconds,
+ speedup,
+ saved,
+ )
+ )
+
+ speedups.sort(key=lambda x: x[4], reverse=True)
+
+ for i, (name, orig, polars, spd, saved) in enumerate(speedups[:5], 1):
+            spd_str = f"{spd:.1f}x" if spd != float("inf") else "∞"
+ lines.append(
+                f" {i}. {name:<26} {orig:.4f}s → {polars:.4f}s ({spd_str} faster, {saved:.4f}s saved)"
+ )
+ lines.append("")
+
+ # THROUGHPUT METRICS
+ if (
+ self.polars_phases
+ and self.input_records > 0
+ and self.transform_duration_seconds > 0
+ and self.polars_transform_seconds > 0
+ ):
+ lines.append("-" * 100)
+ lines.append("THROUGHPUT METRICS")
+ lines.append("-" * 100)
+ orig_throughput = self.input_records / self.transform_duration_seconds
+ polars_throughput = self.input_records / self.polars_transform_seconds
+ lines.append(f"Original Pipeline: {orig_throughput:,.0f} records/second")
+ lines.append(f"Polars Pipeline: {polars_throughput:,.0f} records/second")
+ lines.append(
+ f"Throughput Gain: {polars_throughput - orig_throughput:,.0f} records/second faster"
+ )
+ lines.append("")
+
+ lines.append("=" * 100)
+ lines.append(
+ f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+ )
+ lines.append("=" * 100)
+
+ return "\n".join(lines)
+
+ def save_json(self, path: Path):
+ """Save report as JSON file."""
+ import json
+
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with open(path, 'w') as f:
+ json.dump(self.to_dict(), f, indent=2)
+
+ def save_text(self, path: Path):
+ """Save report as text file."""
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with open(path, 'w') as f:
+ f.write(self.generate_text_report())
diff --git a/local_testing/pipeline_runner.py b/local_testing/pipeline_runner.py
new file mode 100644
index 00000000..3f69ccfd
--- /dev/null
+++ b/local_testing/pipeline_runner.py
@@ -0,0 +1,444 @@
+"""
+Pipeline execution engine for title-boundary dataset.
+
+Handles running the full 26-phase digital-land transformation pipeline
+with detailed timing and progress tracking.
+"""
+
+import time
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, Optional
+
+from pipeline_config import PipelineConfig
+
+
+class PipelineRunner:
+ """Executes the digital-land transformation pipeline with timing."""
+
+ def __init__(self, dataset: str = "title-boundary"):
+ """
+ Initialize pipeline runner.
+
+ Args:
+ dataset: Name of the dataset being processed
+ """
+ self.dataset = dataset
+ self.pipeline_imports = None
+
+ def get_pipeline_imports(self):
+ """
+ Lazy import of digital-land pipeline modules.
+
+ Returns dict of imported classes and functions.
+ """
+ if self.pipeline_imports is not None:
+ return self.pipeline_imports
+
+ from digital_land.phase.convert import ConvertPhase
+ from digital_land.phase.normalise import NormalisePhase
+ from digital_land.phase.parse import ParsePhase
+ from digital_land.phase.concat import ConcatFieldPhase
+ from digital_land.phase.filter import FilterPhase
+ from digital_land.phase.map import MapPhase
+ from digital_land.phase.patch import PatchPhase
+ from digital_land.phase.harmonise import HarmonisePhase
+ from digital_land.phase.default import DefaultPhase
+ from digital_land.phase.migrate import MigratePhase
+ from digital_land.phase.organisation import OrganisationPhase
+ from digital_land.phase.prune import (
+ FieldPrunePhase,
+ EntityPrunePhase,
+ FactPrunePhase,
+ )
+ from digital_land.phase.reference import (
+ EntityReferencePhase,
+ FactReferencePhase,
+ )
+ from digital_land.phase.prefix import EntityPrefixPhase
+ from digital_land.phase.lookup import EntityLookupPhase, FactLookupPhase
+ from digital_land.phase.priority import PriorityPhase
+ from digital_land.phase.pivot import PivotPhase
+ from digital_land.phase.combine import FactCombinePhase
+ from digital_land.phase.factor import FactorPhase
+ from digital_land.phase.save import SavePhase
+ from digital_land.pipeline.main import Pipeline
+ from digital_land.specification import Specification
+ from digital_land.organisation import Organisation
+ from digital_land.log import (
+ IssueLog,
+ ColumnFieldLog,
+ DatasetResourceLog,
+ OperationalIssueLog,
+ ConvertedResourceLog,
+ )
+ from digital_land.api import API
+
+ self.pipeline_imports = {
+ "ConvertPhase": ConvertPhase,
+ "NormalisePhase": NormalisePhase,
+ "ParsePhase": ParsePhase,
+ "ConcatFieldPhase": ConcatFieldPhase,
+ "FilterPhase": FilterPhase,
+ "MapPhase": MapPhase,
+ "PatchPhase": PatchPhase,
+ "HarmonisePhase": HarmonisePhase,
+ "DefaultPhase": DefaultPhase,
+ "MigratePhase": MigratePhase,
+ "OrganisationPhase": OrganisationPhase,
+ "FieldPrunePhase": FieldPrunePhase,
+ "EntityPrunePhase": EntityPrunePhase,
+ "FactPrunePhase": FactPrunePhase,
+ "EntityReferencePhase": EntityReferencePhase,
+ "FactReferencePhase": FactReferencePhase,
+ "EntityPrefixPhase": EntityPrefixPhase,
+ "EntityLookupPhase": EntityLookupPhase,
+ "FactLookupPhase": FactLookupPhase,
+ "PriorityPhase": PriorityPhase,
+ "PivotPhase": PivotPhase,
+ "FactCombinePhase": FactCombinePhase,
+ "FactorPhase": FactorPhase,
+ "SavePhase": SavePhase,
+ "Pipeline": Pipeline,
+ "Specification": Specification,
+ "Organisation": Organisation,
+ "IssueLog": IssueLog,
+ "ColumnFieldLog": ColumnFieldLog,
+ "DatasetResourceLog": DatasetResourceLog,
+ "OperationalIssueLog": OperationalIssueLog,
+ "ConvertedResourceLog": ConvertedResourceLog,
+ "API": API,
+ }
+
+ return self.pipeline_imports
+
+ def run_full_pipeline(
+ self,
+ input_csv: Path,
+ output_dir: Path,
+ specification_dir: Path,
+ pipeline_dir: Path,
+ cache_dir: Path,
+ la_name: str,
+ report=None,
+ selected_phases=None,
+ ) -> Dict:
+ """
+ Run the full 26-phase digital-land transformation pipeline.
+
+ Args:
+ input_csv: Path to input CSV/Parquet file
+ output_dir: Directory for output files
+ specification_dir: Directory containing specification files
+ pipeline_dir: Directory containing pipeline configuration
+ cache_dir: Directory for cached resources
+ la_name: Local Authority name/slug
+ report: Optional PipelineReport instance for metrics tracking
+ selected_phases: Optional set of phase numbers (1-26) to run
+
+ Returns:
+ Dict with results including file paths and record counts
+ """
+ print(" Loading digital-land pipeline modules...")
+ p = self.get_pipeline_imports()
+
+ # Convert Parquet to CSV if needed (original pipeline only supports CSV)
+ if input_csv.suffix.lower() == ".parquet":
+ import polars as pl
+
+ csv_input = input_csv.with_suffix(".csv")
+ if not csv_input.exists():
+ print(f" Converting Parquet to CSV for original pipeline...")
+ pl.read_parquet(input_csv).write_csv(csv_input)
+ input_csv = csv_input
+
+ # Set up output paths
+ harmonised_csv = output_dir / f"{la_name}_harmonised.csv"
+ facts_csv = output_dir / f"{la_name}_facts.csv"
+ issue_csv = output_dir / f"{la_name}_issues.csv"
+
+ print(f" Input: {input_csv}")
+ print(f" Harmonised: {harmonised_csv}")
+ print(f" Facts: {facts_csv}")
+
+ # Load configuration
+ specification = p["Specification"](str(specification_dir))
+ pipeline = p["Pipeline"](
+ str(pipeline_dir), self.dataset, specification=specification
+ )
+ schema = specification.pipeline.get(pipeline.name, {}).get(
+ "schema", self.dataset
+ )
+ intermediate_fieldnames = specification.intermediate_fieldnames(pipeline)
+ factor_fieldnames = specification.factor_fieldnames()
+
+ # Create logs
+ resource = la_name.lower().replace(" ", "_")
+ issue_log = p["IssueLog"](dataset=self.dataset, resource=resource)
+ operational_issue_log = p["OperationalIssueLog"](
+ dataset=self.dataset, resource=resource
+ )
+ column_field_log = p["ColumnFieldLog"](dataset=self.dataset, resource=resource)
+ dataset_resource_log = p["DatasetResourceLog"](
+ dataset=self.dataset, resource=resource
+ )
+ converted_resource_log = p["ConvertedResourceLog"](
+ dataset=self.dataset, resource=resource
+ )
+
+ # Load organization data
+ org_csv = PipelineConfig.download_organisation_csv(cache_dir)
+ organisation = p["Organisation"](
+ organisation_path=str(org_csv), pipeline_dir=Path(pipeline_dir)
+ )
+ api = p["API"](specification=specification)
+
+ # Get configuration
+ entity_range_min = specification.get_dataset_entity_min(self.dataset)
+ entity_range_max = specification.get_dataset_entity_max(self.dataset)
+ endpoints = []
+ organisations_list = ["government-organisation:D2"]
+ entry_date = datetime.now().strftime("%Y-%m-%d")
+
+ # Get pipeline configuration
+ skip_patterns = pipeline.skip_patterns(resource, endpoints)
+ columns = pipeline.columns(resource, endpoints=endpoints)
+ concats = pipeline.concatenations(resource, endpoints=endpoints)
+ patches = pipeline.patches(resource=resource, endpoints=endpoints)
+ lookups = pipeline.lookups(resource=resource)
+ default_fields = pipeline.default_fields(resource=resource, endpoints=endpoints)
+ default_values = pipeline.default_values(endpoints=endpoints)
+ combine_fields = pipeline.combine_fields(endpoints=endpoints)
+ redirect_lookups = pipeline.redirect_lookups()
+ migrations = pipeline.migrations()
+ config = None
+ valid_category_values = api.get_valid_category_values(self.dataset, pipeline)
+
+ if len(organisations_list) == 1:
+ default_values["organisation"] = organisations_list[0]
+ if entry_date and "entry-date" not in default_values:
+ default_values["entry-date"] = entry_date
+
+ field_datatype_map = specification.get_field_datatype_map()
+ field_typology_map = specification.get_field_typology_map()
+ field_prefix_map = specification.get_field_prefix_map()
+ dataset_prefix = specification.dataset_prefix(self.dataset)
+
+ print(" Running 26-phase pipeline with per-phase timing...")
+
+ # Define phase creators
+ phase_creators = [
+ (
+ 1,
+ "ConvertPhase",
+ lambda: p["ConvertPhase"](
+ path=str(input_csv),
+ dataset_resource_log=dataset_resource_log,
+ converted_resource_log=converted_resource_log,
+ ),
+ ),
+ (
+ 2,
+ "NormalisePhase",
+ lambda: p["NormalisePhase"](skip_patterns=skip_patterns),
+ ),
+ (3, "ParsePhase", lambda: p["ParsePhase"]()),
+ (
+ 4,
+ "ConcatFieldPhase",
+ lambda: p["ConcatFieldPhase"](concats=concats, log=column_field_log),
+ ),
+ (
+ 5,
+ "FilterPhase-1",
+ lambda: p["FilterPhase"](filters=pipeline.filters(resource)),
+ ),
+ (
+ 6,
+ "MapPhase",
+ lambda: p["MapPhase"](
+ fieldnames=intermediate_fieldnames,
+ columns=columns,
+ log=column_field_log,
+ ),
+ ),
+ (
+ 7,
+ "FilterPhase-2",
+ lambda: p["FilterPhase"](
+ filters=pipeline.filters(resource, endpoints=endpoints)
+ ),
+ ),
+ (
+ 8,
+ "PatchPhase",
+ lambda: p["PatchPhase"](issues=issue_log, patches=patches),
+ ),
+ (
+ 9,
+ "HarmonisePhase",
+ lambda: p["HarmonisePhase"](
+ field_datatype_map=field_datatype_map,
+ issues=issue_log,
+ dataset=self.dataset,
+ valid_category_values=valid_category_values,
+ ),
+ ),
+ (
+ 10,
+ "DefaultPhase",
+ lambda: p["DefaultPhase"](
+ default_fields=default_fields,
+ default_values=default_values,
+ issues=issue_log,
+ ),
+ ),
+ (
+ 11,
+ "MigratePhase",
+ lambda: p["MigratePhase"](
+ fields=specification.schema_field[schema], migrations=migrations
+ ),
+ ),
+ (
+ 12,
+ "OrganisationPhase",
+ lambda: p["OrganisationPhase"](
+ organisation=organisation, issues=issue_log
+ ),
+ ),
+ (
+ 13,
+ "FieldPrunePhase",
+ lambda: p["FieldPrunePhase"](
+ fields=specification.current_fieldnames(schema)
+ ),
+ ),
+ (
+ 14,
+ "EntityReferencePhase",
+ lambda: p["EntityReferencePhase"](
+ dataset=self.dataset, prefix=dataset_prefix, issues=issue_log
+ ),
+ ),
+ (
+ 15,
+ "EntityPrefixPhase",
+ lambda: p["EntityPrefixPhase"](dataset=self.dataset),
+ ),
+ (
+ 16,
+ "EntityLookupPhase",
+ lambda: p["EntityLookupPhase"](
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ operational_issue_log=operational_issue_log,
+ entity_range=[entity_range_min, entity_range_max],
+ ),
+ ),
+ (
+ 17,
+ "SavePhase-harmonised",
+ lambda: p["SavePhase"](
+ str(harmonised_csv),
+ fieldnames=intermediate_fieldnames,
+ enabled=True,
+ ),
+ ),
+ (
+ 18,
+ "EntityPrunePhase",
+ lambda: p["EntityPrunePhase"](
+ dataset_resource_log=dataset_resource_log
+ ),
+ ),
+ (
+ 19,
+ "PriorityPhase",
+ lambda: p["PriorityPhase"](config=config, providers=organisations_list),
+ ),
+ (20, "PivotPhase", lambda: p["PivotPhase"]()),
+ (
+ 21,
+ "FactCombinePhase",
+ lambda: p["FactCombinePhase"](
+ issue_log=issue_log, fields=combine_fields
+ ),
+ ),
+ (22, "FactorPhase", lambda: p["FactorPhase"]()),
+ (
+ 23,
+ "FactReferencePhase",
+ lambda: p["FactReferencePhase"](
+ field_typology_map=field_typology_map,
+ field_prefix_map=field_prefix_map,
+ ),
+ ),
+ (
+ 24,
+ "FactLookupPhase",
+ lambda: p["FactLookupPhase"](
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ odp_collections=specification.get_odp_collections(),
+ ),
+ ),
+ (25, "FactPrunePhase", lambda: p["FactPrunePhase"]()),
+ (
+ 26,
+ "SavePhase-facts",
+ lambda: p["SavePhase"](str(facts_csv), fieldnames=factor_fieldnames),
+ ),
+ ]
+
+ # Run phases with timing
+ stream_data = []
+ total_start = time.time()
+
+ for phase_num, phase_name, phase_creator in phase_creators:
+ phase = phase_creator()
+ phase_start = time.time()
+
+ if phase_num == 1:
+ output_stream = phase.process(iter([]))
+ else:
+ output_stream = phase.process(iter(stream_data))
+
+ stream_data = list(output_stream)
+ duration = time.time() - phase_start
+ output_count = len(stream_data)
+
+ if report:
+ metrics = report.add_phase(phase_name, phase_num)
+ metrics.duration_seconds = duration
+ metrics.output_count = output_count
+
+ if duration > 0.1:
+ print(
+ f" Phase {phase_num:2d}: {phase_name:<25} {duration:8.4f}s ({output_count:,} rows)"
+ )
+
+ total_transform_time = time.time() - total_start
+ print(f" Total transform time: {total_transform_time:.3f}s")
+
+ # Count results
+ harmonised_count = (
+ sum(1 for _ in open(harmonised_csv)) - 1 if harmonised_csv.exists() else 0
+ )
+ facts_count = sum(1 for _ in open(facts_csv)) - 1 if facts_csv.exists() else 0
+ issue_log.save(str(issue_csv))
+
+ if report:
+ report.harmonised_records = harmonised_count
+ report.fact_records = facts_count
+
+ return {
+ "harmonised": harmonised_count,
+ "facts": facts_count,
+ "harmonised_path": str(harmonised_csv),
+ "facts_path": str(facts_csv),
+ "issues_path": str(issue_csv),
+ "transform_time": total_transform_time,
+ }
From ecd0b4f9ab8921a5a8397a2ef6ae1a71a4a1e43d Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 9 Feb 2026 13:52:58 +0000
Subject: [PATCH 12/15] =?UTF-8?q?refactor:=20improve=20code=20formatting?=
=?UTF-8?q?=20and=20readability=20across=20multiple=20files=20Rapid=20loca?=
=?UTF-8?q?l=20performance=20test=20environment=20supporting=20the=20Polar?=
=?UTF-8?q?s=E2=80=91based=20transformation=20rewrite=20in=20digital-land-?=
=?UTF-8?q?python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/file_downloader.py | 112 +++++++----
local_testing/gml_converter.py | 332 ++++++++++++++++++-------------
local_testing/gml_extractor.py | 26 +--
local_testing/main.py | 29 +--
local_testing/pipeline_report.py | 48 +++--
local_testing/run_all.py | 14 +-
6 files changed, 326 insertions(+), 235 deletions(-)
diff --git a/local_testing/file_downloader.py b/local_testing/file_downloader.py
index e13fc40b..d3bcb4ea 100644
--- a/local_testing/file_downloader.py
+++ b/local_testing/file_downloader.py
@@ -12,6 +12,7 @@
try:
import requests
+
HAS_REQUESTS = True
except ImportError:
HAS_REQUESTS = False
@@ -32,23 +33,27 @@ def fetch_endpoint_list(self) -> List[dict]:
req = urllib.request.Request(
self.endpoint_csv_url,
- headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
+ headers={
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
+ },
)
-
+
with urllib.request.urlopen(req) as response:
- content = response.read().decode('utf-8')
+ content = response.read().decode("utf-8")
reader = csv.DictReader(content.splitlines())
-
+
endpoints = []
for row in reader:
- url = row.get('endpoint-url', '').strip()
+ url = row.get("endpoint-url", "").strip()
if url:
- endpoints.append({
- 'endpoint': row.get('endpoint', ''),
- 'url': url,
- 'local_authority': self.get_la_name_from_url(url),
- 'entry_date': row.get('entry-date', ''),
- })
+ endpoints.append(
+ {
+ "endpoint": row.get("endpoint", ""),
+ "url": url,
+ "local_authority": self.get_la_name_from_url(url),
+ "entry_date": row.get("entry-date", ""),
+ }
+ )
print(f" Found {len(endpoints)} endpoints")
return endpoints
@@ -61,16 +66,27 @@ def get_la_name_from_url(url: str) -> str:
if parts:
filename = parts[-1].replace(".zip", "").replace("_", " ")
# Remove common suffixes for cleaner names
- for suffix in [" Council", " Borough Council", " City Council", " District Council",
- " Metropolitan Borough Council", " County Council"]:
+ for suffix in [
+ " Council",
+ " Borough Council",
+ " City Council",
+ " District Council",
+ " Metropolitan Borough Council",
+ " County Council",
+ ]:
if filename.endswith(suffix):
- filename = filename[:-len(suffix)]
+ filename = filename[: -len(suffix)]
break
# Remove prefixes
- for prefix in ["Borough of ", "City of ", "County of ", "Royal Borough of ",
- "London Borough of "]:
+ for prefix in [
+ "Borough of ",
+ "City of ",
+ "County of ",
+ "Royal Borough of ",
+ "London Borough of ",
+ ]:
if filename.startswith(prefix):
- filename = filename[len(prefix):]
+ filename = filename[len(prefix) :]
break
return filename.strip()
return "Unknown"
@@ -99,58 +115,66 @@ def download_file(
return self._download_with_requests(url, output_path, chunk_size)
else:
return self._download_with_urllib(url, output_path, chunk_size)
-
- def _download_with_requests(self, url: str, output_path: Path, chunk_size: int) -> Path:
+
+ def _download_with_requests(
+ self, url: str, output_path: Path, chunk_size: int
+ ) -> Path:
"""Download using requests library (handles redirects better)."""
headers = {
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language': 'en-GB,en;q=0.9',
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "Accept-Language": "en-GB,en;q=0.9",
}
-
+
session = requests.Session()
session.headers.update(headers)
-
+
response = session.get(url, stream=True, allow_redirects=True, timeout=30)
response.raise_for_status()
-
- total_size = int(response.headers.get('content-length', 0))
+
+ total_size = int(response.headers.get("content-length", 0))
downloaded = 0
-
- with open(output_path, 'wb') as f:
+
+ with open(output_path, "wb") as f:
for chunk in response.iter_content(chunk_size=chunk_size):
if chunk:
f.write(chunk)
downloaded += len(chunk)
-
+
if total_size > 0:
progress = (downloaded / total_size) * 100
mb_downloaded = downloaded / (1024 * 1024)
mb_total = total_size / (1024 * 1024)
- print(f"\r Progress: {progress:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end="", flush=True)
-
+ print(
+ f"\r Progress: {progress:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)",
+ end="",
+ flush=True,
+ )
+
print() # New line after progress
+        print(f"  ✓ Downloaded {downloaded:,} bytes")
return output_path
-
- def _download_with_urllib(self, url: str, output_path: Path, chunk_size: int) -> Path:
+
+ def _download_with_urllib(
+ self, url: str, output_path: Path, chunk_size: int
+ ) -> Path:
"""Download using urllib (fallback)."""
# Add comprehensive browser headers to mimic real browser
headers = {
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
- 'Accept-Language': 'en-GB,en;q=0.9',
- 'Accept-Encoding': 'gzip, deflate, br',
- 'Connection': 'keep-alive',
- 'Upgrade-Insecure-Requests': '1',
- 'Sec-Fetch-Dest': 'document',
- 'Sec-Fetch-Mode': 'navigate',
- 'Sec-Fetch-Site': 'none',
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+ "Accept-Language": "en-GB,en;q=0.9",
+ "Accept-Encoding": "gzip, deflate, br",
+ "Connection": "keep-alive",
+ "Upgrade-Insecure-Requests": "1",
+ "Sec-Fetch-Dest": "document",
+ "Sec-Fetch-Mode": "navigate",
+ "Sec-Fetch-Site": "none",
}
-
+
req = urllib.request.Request(url, headers=headers)
-
+
with urllib.request.urlopen(req) as response:
total_size = int(response.headers.get("content-length", 0))
downloaded = 0
diff --git a/local_testing/gml_converter.py b/local_testing/gml_converter.py
index 4a3b4036..1ece02f1 100644
--- a/local_testing/gml_converter.py
+++ b/local_testing/gml_converter.py
@@ -15,44 +15,48 @@
class GMLConverter:
"""Converts GML files to CSV/Parquet with multiple strategies."""
-
+
@staticmethod
def extract_polygon_wkt(geometry_text: str) -> str:
"""
Extract polygon coordinates and convert to WKT format.
-
+
Handles both exterior rings and interior rings (holes).
-
+
Args:
geometry_text: GML geometry element text
-
+
Returns:
WKT polygon string, or empty string if no valid geometry
"""
exterior_match = re.search(
- r'.*?([^<]+).*?',
- geometry_text, re.DOTALL
+ r".*?([^<]+).*?",
+ geometry_text,
+ re.DOTALL,
)
-
+
if not exterior_match:
return ""
-
+
exterior_coords_raw = exterior_match.group(1).strip().split()
exterior_coords = []
for i in range(0, len(exterior_coords_raw), 2):
if i + 1 < len(exterior_coords_raw):
- exterior_coords.append(f"{exterior_coords_raw[i]} {exterior_coords_raw[i+1]}")
-
+ exterior_coords.append(
+ f"{exterior_coords_raw[i]} {exterior_coords_raw[i+1]}"
+ )
+
if not exterior_coords:
return ""
-
+
# Extract interior rings (holes)
interior_rings = []
interior_matches = re.findall(
- r'.*?([^<]+).*?',
- geometry_text, re.DOTALL
+ r".*?([^<]+).*?",
+ geometry_text,
+ re.DOTALL,
)
-
+
for interior_coords_raw in interior_matches:
coords = interior_coords_raw.strip().split()
ring_coords = []
@@ -61,127 +65,149 @@ def extract_polygon_wkt(geometry_text: str) -> str:
ring_coords.append(f"{coords[i]} {coords[i+1]}")
if ring_coords:
interior_rings.append(ring_coords)
-
+
exterior_wkt = f"({', '.join(exterior_coords)})"
if interior_rings:
interior_wkts = [f"({', '.join(ring)})" for ring in interior_rings]
return f"POLYGON({exterior_wkt}, {', '.join(interior_wkts)})"
return f"POLYGON({exterior_wkt})"
-
+
@staticmethod
def extract_field(text: str, field_name: str) -> str:
"""
Extract a field value from GML text.
-
+
Args:
text: GML text to search
field_name: Field name to extract
-
+
Returns:
Field value, or empty string if not found
"""
-        pattern = f'<{field_name}>([^<]+)</{field_name}>'
+        pattern = f"<{field_name}>([^<]+)</{field_name}>"
match = re.search(pattern, text)
return match.group(1) if match else ""
-
- def convert_to_csv(self, gml_path: Path, csv_path: Path, limit: Optional[int] = None) -> int:
+
+ def convert_to_csv(
+ self, gml_path: Path, csv_path: Path, limit: Optional[int] = None
+ ) -> int:
"""
Convert GML file to CSV format using regex parsing.
-
+
This is the baseline method - slower but doesn't require external dependencies.
-
+
Args:
gml_path: Path to input GML file
csv_path: Path to output CSV file
limit: Optional limit on number of records to convert
-
+
Returns:
Number of records converted
"""
print(f" Converting GML to CSV...")
print(f" Input: {gml_path}")
print(f" Output: {csv_path}")
-
+
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" GML size: {size_mb:.1f} MB")
-
- with open(gml_path, 'r', encoding='utf-8') as f:
+
+ with open(gml_path, "r", encoding="utf-8") as f:
content = f.read()
-
+
# Find all cadastral parcel elements
-        pattern = r'<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>'
+        pattern = r"<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>"
matches = re.findall(pattern, content, re.DOTALL)
total_features = len(matches)
print(f" Found {total_features} cadastral parcels")
-
+
if limit:
print(f" Limiting to {limit} records")
-
+
fieldnames = [
- 'reference', 'name', 'national-cadastral-reference', 'geometry',
- 'start-date', 'entry-date', 'end-date', 'prefix', 'organisation', 'notes'
+ "reference",
+ "name",
+ "national-cadastral-reference",
+ "geometry",
+ "start-date",
+ "entry-date",
+ "end-date",
+ "prefix",
+ "organisation",
+ "notes",
]
-
+
csv_path.parent.mkdir(parents=True, exist_ok=True)
count = 0
-
- with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
+
+ with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
+ writer = csv.DictWriter(
+ csvfile, fieldnames=fieldnames, extrasaction="ignore"
+ )
writer.writeheader()
-
+
for match in matches:
feature = {}
-
- inspire_id = self.extract_field(match, 'INSPIREID')
+
+ inspire_id = self.extract_field(match, "INSPIREID")
if inspire_id:
- feature['reference'] = inspire_id
- feature['name'] = inspire_id
-
- ncr = self.extract_field(match, 'NATIONALCADASTRALREFERENCE')
+ feature["reference"] = inspire_id
+ feature["name"] = inspire_id
+
+ ncr = self.extract_field(match, "NATIONALCADASTRALREFERENCE")
if ncr:
- feature['national-cadastral-reference'] = ncr
-
- valid_from = self.extract_field(match, 'VALIDFROM')
+ feature["national-cadastral-reference"] = ncr
+
+ valid_from = self.extract_field(match, "VALIDFROM")
if valid_from:
- feature['start-date'] = valid_from.split('T')[0] if 'T' in valid_from else valid_from
-
- begin_lifespan = self.extract_field(match, 'BEGINLIFESPANVERSION')
+ feature["start-date"] = (
+ valid_from.split("T")[0] if "T" in valid_from else valid_from
+ )
+
+ begin_lifespan = self.extract_field(match, "BEGINLIFESPANVERSION")
if begin_lifespan:
- feature['entry-date'] = begin_lifespan.split('T')[0] if 'T' in begin_lifespan else begin_lifespan
-
-            geometry_match = re.search(r'<cp:geometry>(.*?)</cp:geometry>', match, re.DOTALL)
+ feature["entry-date"] = (
+ begin_lifespan.split("T")[0]
+ if "T" in begin_lifespan
+ else begin_lifespan
+ )
+
+ geometry_match = re.search(
+                r"<cp:geometry>(.*?)</cp:geometry>", match, re.DOTALL
+ )
if geometry_match:
wkt = self.extract_polygon_wkt(geometry_match.group(1))
if wkt:
- feature['geometry'] = wkt
-
- if 'reference' in feature:
- feature['prefix'] = 'title-boundary'
- feature['organisation'] = 'government-organisation:D2'
+ feature["geometry"] = wkt
+
+ if "reference" in feature:
+ feature["prefix"] = "title-boundary"
+ feature["organisation"] = "government-organisation:D2"
writer.writerow(feature)
count += 1
-
+
if count % 5000 == 0:
print(f" Converted {count}/{total_features} features...")
-
+
if limit and count >= limit:
break
-
+
print(f" Converted {count} records to CSV")
return count
-
- def convert_to_parquet(self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None) -> int:
+
+ def convert_to_parquet(
+ self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None
+ ) -> int:
"""
Convert GML file to Parquet format using regex parsing + Polars.
-
+
Parquet is faster to read than CSV and preserves data types.
Falls back to CSV if Polars is not installed.
-
+
Args:
gml_path: Path to input GML file
parquet_path: Path to output Parquet file
limit: Optional limit on number of records to convert
-
+
Returns:
Number of records converted
"""
@@ -190,86 +216,96 @@ def convert_to_parquet(self, gml_path: Path, parquet_path: Path, limit: Optional
except ImportError:
print(" Polars not installed. Install with: pip install polars")
print(" Falling back to CSV...")
- csv_path = parquet_path.with_suffix('.csv')
+ csv_path = parquet_path.with_suffix(".csv")
return self.convert_to_csv(gml_path, csv_path, limit)
-
+
print(f" Converting GML to Parquet...")
print(f" Input: {gml_path}")
print(f" Output: {parquet_path}")
-
+
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" GML size: {size_mb:.1f} MB")
-
- with open(gml_path, 'r', encoding='utf-8') as f:
+
+ with open(gml_path, "r", encoding="utf-8") as f:
content = f.read()
-
+
# Find all cadastral parcel elements
-        pattern = r'<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>'
+        pattern = r"<cp:CadastralParcel[^>]*>(.*?)</cp:CadastralParcel>"
matches = re.findall(pattern, content, re.DOTALL)
total_features = len(matches)
print(f" Found {total_features} cadastral parcels")
-
+
if limit:
print(f" Limiting to {limit} records")
matches = matches[:limit]
-
+
# Build list of records
records = []
for match in matches:
feature = {}
-
- inspire_id = self.extract_field(match, 'INSPIREID')
+
+ inspire_id = self.extract_field(match, "INSPIREID")
if inspire_id:
- feature['reference'] = inspire_id
- feature['name'] = inspire_id
-
- ncr = self.extract_field(match, 'NATIONALCADASTRALREFERENCE')
+ feature["reference"] = inspire_id
+ feature["name"] = inspire_id
+
+ ncr = self.extract_field(match, "NATIONALCADASTRALREFERENCE")
if ncr:
- feature['national-cadastral-reference'] = ncr
-
- valid_from = self.extract_field(match, 'VALIDFROM')
+ feature["national-cadastral-reference"] = ncr
+
+ valid_from = self.extract_field(match, "VALIDFROM")
if valid_from:
- feature['start-date'] = valid_from.split('T')[0] if 'T' in valid_from else valid_from
-
- begin_lifespan = self.extract_field(match, 'BEGINLIFESPANVERSION')
+ feature["start-date"] = (
+ valid_from.split("T")[0] if "T" in valid_from else valid_from
+ )
+
+ begin_lifespan = self.extract_field(match, "BEGINLIFESPANVERSION")
if begin_lifespan:
- feature['entry-date'] = begin_lifespan.split('T')[0] if 'T' in begin_lifespan else begin_lifespan
-
-            geometry_match = re.search(r'<cp:geometry>(.*?)</cp:geometry>', match, re.DOTALL)
+ feature["entry-date"] = (
+ begin_lifespan.split("T")[0]
+ if "T" in begin_lifespan
+ else begin_lifespan
+ )
+
+ geometry_match = re.search(
+                r"<cp:geometry>(.*?)</cp:geometry>", match, re.DOTALL
+ )
if geometry_match:
wkt = self.extract_polygon_wkt(geometry_match.group(1))
if wkt:
- feature['geometry'] = wkt
-
- if 'reference' in feature:
- feature['prefix'] = 'title-boundary'
- feature['organisation'] = 'government-organisation:D2'
- feature['end-date'] = None
- feature['notes'] = None
+ feature["geometry"] = wkt
+
+ if "reference" in feature:
+ feature["prefix"] = "title-boundary"
+ feature["organisation"] = "government-organisation:D2"
+ feature["end-date"] = None
+ feature["notes"] = None
records.append(feature)
-
+
# Create DataFrame and write to Parquet
parquet_path.parent.mkdir(parents=True, exist_ok=True)
-
+
df = pl.DataFrame(records)
- df.write_parquet(parquet_path, compression='snappy')
-
+ df.write_parquet(parquet_path, compression="snappy")
+
count = len(records)
print(f" Converted {count} records to Parquet")
return count
-
- def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None) -> int:
+
+ def convert_to_parquet_duckdb(
+ self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None
+ ) -> int:
"""
Convert GML file to Parquet format using DuckDB with spatial extension.
-
+
This is the fastest method - DuckDB reads GML directly and writes Parquet.
Falls back to Polars-based converter if DuckDB is not available.
-
+
Args:
gml_path: Path to input GML file
parquet_path: Path to output Parquet file
limit: Optional limit on number of records to convert
-
+
Returns:
Number of records converted
"""
@@ -279,16 +315,16 @@ def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: O
print(" DuckDB not installed. Install with: pip install duckdb")
print(" Falling back to Polars-based converter...")
return self.convert_to_parquet(gml_path, parquet_path, limit)
-
+
print(f" Converting GML to Parquet using DuckDB...")
print(f" Input: {gml_path}")
print(f" Output: {parquet_path}")
-
+
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" GML size: {size_mb:.1f} MB")
-
+
parquet_path.parent.mkdir(parents=True, exist_ok=True)
-
+
try:
con = duckdb.connect()
try:
@@ -299,10 +335,10 @@ def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: O
print(" Falling back to Polars-based converter...")
con.close()
return self.convert_to_parquet(gml_path, parquet_path, limit)
-
+
print(" Reading GML file...")
limit_clause = f"LIMIT {limit}" if limit else ""
-
+
query = f"""
SELECT
INSPIREID as reference,
@@ -327,48 +363,54 @@ def convert_to_parquet_duckdb(self, gml_path: Path, parquet_path: Path, limit: O
WHERE INSPIREID IS NOT NULL
{limit_clause}
"""
-
+
count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')"
total_count = con.execute(count_query).fetchone()[0]
print(f" Found {total_count:,} cadastral parcels")
-
+
if limit:
print(f" Limiting to {limit} records")
-
+
# Export directly to Parquet (much faster than CSV)
print(" Transforming and writing to Parquet...")
- con.execute(f"COPY ({query}) TO '{parquet_path}' (FORMAT PARQUET, COMPRESSION 'snappy')")
-
+ con.execute(
+ f"COPY ({query}) TO '{parquet_path}' (FORMAT PARQUET, COMPRESSION 'snappy')"
+ )
+
# Count output rows
- result_count = con.execute(f"SELECT COUNT(*) FROM read_parquet('{parquet_path}')").fetchone()[0]
-
+ result_count = con.execute(
+ f"SELECT COUNT(*) FROM read_parquet('{parquet_path}')"
+ ).fetchone()[0]
+
con.close()
-
+
print(f" Converted {result_count:,} records to Parquet")
return result_count
-
+
except Exception as e:
print(f" DuckDB conversion failed: {e}")
print(" Falling back to Polars-based converter...")
return self.convert_to_parquet(gml_path, parquet_path, limit)
-
- def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[int] = None) -> int:
+
+ def convert_to_csv_duckdb(
+ self, gml_path: Path, csv_path: Path, limit: Optional[int] = None
+ ) -> int:
"""
Convert GML file to CSV format using DuckDB with spatial extension.
-
+
This is significantly faster than regex parsing and properly handles:
- Coordinate transformations (OSGB EPSG:27700 to WGS84 EPSG:4326)
- Complex geometries (multi-polygons, holes)
- Large files with streaming
-
+
Note: For even better performance, use convert_to_parquet_duckdb() instead.
Falls back to regex-based converter if DuckDB is not available.
-
+
Args:
gml_path: Path to input GML file
csv_path: Path to output CSV file
limit: Optional limit on number of records to convert
-
+
Returns:
Number of records converted
"""
@@ -378,16 +420,16 @@ def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[
print(" DuckDB not installed. Install with: pip install duckdb")
print(" Falling back to regex-based converter...")
return self.convert_to_csv(gml_path, csv_path, limit)
-
+
print(f" Converting GML to CSV using DuckDB...")
print(f" Input: {gml_path}")
print(f" Output: {csv_path}")
-
+
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" GML size: {size_mb:.1f} MB")
-
+
csv_path.parent.mkdir(parents=True, exist_ok=True)
-
+
try:
# Create DuckDB connection and load spatial extension
con = duckdb.connect()
@@ -397,16 +439,18 @@ def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[
except Exception as ext_err:
print(f" Failed to load spatial extension: {ext_err}")
print(" This may be a network issue. Try running:")
- print(" python -c \"import duckdb; duckdb.connect().execute('INSTALL spatial')\"")
+ print(
+ " python -c \"import duckdb; duckdb.connect().execute('INSTALL spatial')\""
+ )
print(" Falling back to regex-based converter...")
con.close()
return self.convert_to_csv(gml_path, csv_path, limit)
-
+
# Read GML file using ST_Read (GDAL-based)
print(" Reading GML file...")
-
+
limit_clause = f"LIMIT {limit}" if limit else ""
-
+
query = f"""
SELECT
INSPIREID as reference,
@@ -431,27 +475,29 @@ def convert_to_csv_duckdb(self, gml_path: Path, csv_path: Path, limit: Optional[
WHERE INSPIREID IS NOT NULL
{limit_clause}
"""
-
+
# Execute and get count first
count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')"
total_count = con.execute(count_query).fetchone()[0]
print(f" Found {total_count:,} cadastral parcels")
-
+
if limit:
print(f" Limiting to {limit} records")
-
+
# Export directly to CSV
print(" Transforming and writing to CSV...")
con.execute(f"COPY ({query}) TO '{csv_path}' (HEADER, DELIMITER ',')")
-
+
# Count output rows
- result_count = con.execute(f"SELECT COUNT(*) FROM read_csv('{csv_path}')").fetchone()[0]
-
+ result_count = con.execute(
+ f"SELECT COUNT(*) FROM read_csv('{csv_path}')"
+ ).fetchone()[0]
+
con.close()
-
+
print(f" Converted {result_count:,} records to CSV")
return result_count
-
+
except Exception as e:
print(f" DuckDB conversion failed: {e}")
print(" Falling back to regex-based converter...")
diff --git a/local_testing/gml_extractor.py b/local_testing/gml_extractor.py
index fb004301..767ed8b1 100644
--- a/local_testing/gml_extractor.py
+++ b/local_testing/gml_extractor.py
@@ -10,41 +10,41 @@
class GMLExtractor:
"""Extracts GML files from ZIP archives."""
-
+
@staticmethod
def extract_gml_from_zip(zip_path: Path, output_dir: Path) -> Path:
"""
Extract GML file from ZIP archive.
-
+
Args:
zip_path: Path to ZIP file
output_dir: Directory to extract GML file to
-
+
Returns:
Path to extracted GML file
-
+
Raises:
ValueError: If no GML file found in archive
"""
output_dir.mkdir(parents=True, exist_ok=True)
-
+
print(f" Extracting GML from {zip_path}")
-
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
# Find GML file in archive
- gml_files = [f for f in zip_ref.namelist() if f.lower().endswith('.gml')]
-
+ gml_files = [f for f in zip_ref.namelist() if f.lower().endswith(".gml")]
+
if not gml_files:
raise ValueError(f"No GML file found in {zip_path}")
-
+
gml_filename = gml_files[0]
print(f" Found: {gml_filename}")
-
+
# Extract to output directory
zip_ref.extract(gml_filename, output_dir)
-
+
gml_path = output_dir / gml_filename
size_mb = gml_path.stat().st_size / (1024 * 1024)
print(f" Extracted: {gml_path} ({size_mb:.1f} MB)")
-
+
return gml_path
diff --git a/local_testing/main.py b/local_testing/main.py
index 34b5c586..151aa0dc 100644
--- a/local_testing/main.py
+++ b/local_testing/main.py
@@ -35,10 +35,10 @@
def parse_phase_selection(phases_str: str) -> set:
"""
Parse phase selection string into set of phase numbers.
-
+
Args:
phases_str: Comma-separated phase numbers or ranges (e.g., "1,2,9" or "1-5,9")
-
+
Returns:
Set of selected phase numbers, or None if invalid
"""
@@ -53,11 +53,11 @@ def parse_phase_selection(phases_str: str) -> set:
else:
# Single phase: "9"
phases.add(int(part))
-
+
# Validate phase numbers (1-26)
if any(p < 1 or p > 26 for p in phases):
return None
-
+
return phases
except (ValueError, AttributeError):
return None
@@ -261,20 +261,20 @@ def main():
fact_records=results["facts"],
transform_time=results.get("transform_time", 0),
)
-
+
# Run Polars pipeline for comparison if requested
if args.compare:
print("\n Running Polars pipeline for comparison...")
from polars_phases import run_polars_pipeline, PolarsPhaseMetrics
-
+
# Define required parameters
field_datatype_map = {"geometry": "text"} # Simplified for now
intermediate_fieldnames = ["entity", "name", "geometry", "organisation"]
factor_fieldnames = ["entity", "fact"]
-
+
polars_harmonised = output_dir / f"{la_name}_polars_harmonised.csv"
polars_facts = output_dir / f"{la_name}_polars_facts.csv"
-
+
polars_start = time.time()
polars_metrics, polars_harm_count, polars_fact_count = run_polars_pipeline(
input_csv=output_path,
@@ -287,11 +287,12 @@ def main():
selected_phases=selected_phases, # Pass phase selection to Polars
)
polars_end = time.time()
-
+
# Store Polars metrics in report
report.polars_phases = []
for metric in polars_metrics:
from pipeline_report import PhaseMetrics
+
phase_metric = PhaseMetrics(
name=metric.name,
phase_number=metric.phase_number,
@@ -302,12 +303,16 @@ def main():
output_count=metric.output_count,
)
report.polars_phases.append(phase_metric)
-
+
report.polars_harmonised_records = polars_harm_count
report.polars_fact_records = polars_fact_count
report.polars_transform_seconds = polars_end - polars_start
-
- speedup = results.get("transform_time", 0) / report.polars_transform_seconds if report.polars_transform_seconds > 0 else 0
+
+ speedup = (
+ results.get("transform_time", 0) / report.polars_transform_seconds
+ if report.polars_transform_seconds > 0
+ else 0
+ )
print(f" Polars transform time: {report.polars_transform_seconds:.3f}s")
print(f" Speedup: {speedup:.1f}x faster")
diff --git a/local_testing/pipeline_report.py b/local_testing/pipeline_report.py
index cb4b9c26..189f1447 100644
--- a/local_testing/pipeline_report.py
+++ b/local_testing/pipeline_report.py
@@ -141,16 +141,22 @@ def to_dict(self) -> Dict:
phases_to_output = self.phases
polars_phases_to_output = self.polars_phases
if self.selected_phases:
- phases_to_output = [p for p in self.phases if p.phase_number in self.selected_phases]
- polars_phases_to_output = [p for p in self.polars_phases if p.phase_number in self.selected_phases]
-
+ phases_to_output = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
+ polars_phases_to_output = [
+ p for p in self.polars_phases if p.phase_number in self.selected_phases
+ ]
+
return {
"run_id": self.run_id,
"timestamp": self.timestamp,
"local_authority": self.local_authority,
"dataset": self.dataset,
"record_limit": self.record_limit,
- "selected_phases": list(sorted(self.selected_phases)) if self.selected_phases else None,
+ "selected_phases": (
+ list(sorted(self.selected_phases)) if self.selected_phases else None
+ ),
"input_records": self.input_records,
"harmonised_records": self.harmonised_records,
"fact_records": self.fact_records,
@@ -269,7 +275,7 @@ def generate_text_report(self) -> str:
lines.append("=" * 100)
lines.append("PHASE-BY-PHASE COMPARISON: ORIGINAL vs POLARS")
lines.append("=" * 100)
-
+
# Show phase selection info if applicable
if self.selected_phases:
lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
@@ -287,7 +293,9 @@ def generate_text_report(self) -> str:
# Filter phases if selection is active
phases_to_display = self.phases
if self.selected_phases:
- phases_to_display = [p for p in self.phases if p.phase_number in self.selected_phases]
+ phases_to_display = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
total_original = 0.0
total_polars = 0.0
@@ -339,12 +347,12 @@ def generate_text_report(self) -> str:
lines.append("-" * 100)
lines.append("ORIGINAL PIPELINE - PHASE TIMING (Row-by-Row)")
lines.append("-" * 100)
-
+
# Show phase selection info if applicable
if self.selected_phases:
lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
lines.append("")
-
+
lines.append(
f"{'#':<4} {'Phase Name':<30} {'Duration':>12} {'% of Transform':>14} {'Output':>10}"
)
@@ -353,7 +361,9 @@ def generate_text_report(self) -> str:
# Filter phases if selection is active
phases_to_display = self.phases
if self.selected_phases:
- phases_to_display = [p for p in self.phases if p.phase_number in self.selected_phases]
+ phases_to_display = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
for phase in phases_to_display:
pct = (
@@ -375,12 +385,14 @@ def generate_text_report(self) -> str:
lines.append("-" * 100)
lines.append("TOP 5 SLOWEST PHASES (Original Pipeline)")
lines.append("-" * 100)
-
+
# Filter phases for "top slowest" if selection is active
phases_for_top5 = self.phases
if self.selected_phases:
- phases_for_top5 = [p for p in self.phases if p.phase_number in self.selected_phases]
-
+ phases_for_top5 = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
+
sorted_phases = sorted(
phases_for_top5, key=lambda x: x.duration_seconds, reverse=True
)[:5]
@@ -404,7 +416,9 @@ def generate_text_report(self) -> str:
# Filter phases for speedup calculation if selection is active
phases_for_speedup = self.phases
if self.selected_phases:
- phases_for_speedup = [p for p in self.phases if p.phase_number in self.selected_phases]
+ phases_for_speedup = [
+ p for p in self.phases if p.phase_number in self.selected_phases
+ ]
polars_by_name = {p.name: p for p in self.polars_phases}
speedups = []
@@ -465,13 +479,13 @@ def generate_text_report(self) -> str:
def save_json(self, path: Path):
"""Save report as JSON file."""
import json
-
+
path.parent.mkdir(parents=True, exist_ok=True)
- with open(path, 'w') as f:
+ with open(path, "w") as f:
json.dump(self.to_dict(), f, indent=2)
-
+
def save_text(self, path: Path):
"""Save report as text file."""
path.parent.mkdir(parents=True, exist_ok=True)
- with open(path, 'w') as f:
+ with open(path, "w") as f:
f.write(self.generate_text_report())
diff --git a/local_testing/run_all.py b/local_testing/run_all.py
index 19856b00..55bd3984 100755
--- a/local_testing/run_all.py
+++ b/local_testing/run_all.py
@@ -59,9 +59,11 @@ def main():
# Calculate batch metrics
batch_duration = time.time() - batch_start
- avg_duration = sum(t["duration"] for t in la_times) / len(la_times) if la_times else 0
+ avg_duration = (
+ sum(t["duration"] for t in la_times) / len(la_times) if la_times else 0
+ )
successful_times = [t["duration"] for t in la_times if t["status"] == "success"]
-
+
# Summary
print(f"\n{'='*60}")
print("BATCH PROCESSING COMPLETE (with Polars Comparison)")
@@ -85,7 +87,7 @@ def main():
reports_dir = Path(__file__).parent / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
+
batch_report = {
"batch_timestamp": timestamp,
"total_las": len(endpoints),
@@ -96,13 +98,13 @@ def main():
"polars_comparison_enabled": True,
"limit": limit,
"la_results": la_times,
- "errors": errors
+ "errors": errors,
}
-
+
batch_json = reports_dir / f"batch_{timestamp}_summary.json"
with open(batch_json, "w") as f:
json.dump(batch_report, f, indent=2)
-
+
print(f"\nBatch report saved: {batch_json}")
print(f"{'='*60}\n")
From f6aca1f4964704175f84b980e04dbee13e128dc0 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 9 Feb 2026 14:37:22 +0000
Subject: [PATCH 13/15] =?UTF-8?q?style:=20flake8=20improve=20code=20format?=
=?UTF-8?q?ting=20and=20readability=20in=20GML=20converter,=20main,=20pipe?=
=?UTF-8?q?line=20report,=20pipeline=20runner,=20and=20run=5Fall=20scripts?=
=?UTF-8?q?=20Rapid=20local=20performance=20test=20environment=20supportin?=
=?UTF-8?q?g=20the=20Polars=E2=80=91based=20transformation=20rewrite=20in?=
=?UTF-8?q?=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/gml_converter.py | 28 ++++++++++++++--------------
local_testing/main.py | 4 ++--
local_testing/pipeline_report.py | 6 ++++--
local_testing/pipeline_runner.py | 4 ++--
local_testing/run_all.py | 6 +++---
5 files changed, 25 insertions(+), 23 deletions(-)
diff --git a/local_testing/gml_converter.py b/local_testing/gml_converter.py
index 1ece02f1..c35fbba7 100644
--- a/local_testing/gml_converter.py
+++ b/local_testing/gml_converter.py
@@ -340,20 +340,20 @@ def convert_to_parquet_duckdb(
limit_clause = f"LIMIT {limit}" if limit else ""
query = f"""
- SELECT
+ SELECT
INSPIREID as reference,
INSPIREID as name,
NATIONALCADASTRALREFERENCE as "national-cadastral-reference",
ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry,
- CASE
- WHEN VALIDFROM IS NOT NULL
+ CASE
+ WHEN VALIDFROM IS NOT NULL
THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d')
- ELSE NULL
+ ELSE NULL
END as "start-date",
- CASE
- WHEN BEGINLIFESPANVERSION IS NOT NULL
+ CASE
+ WHEN BEGINLIFESPANVERSION IS NOT NULL
THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d')
- ELSE NULL
+ ELSE NULL
END as "entry-date",
NULL as "end-date",
'title-boundary' as prefix,
@@ -452,20 +452,20 @@ def convert_to_csv_duckdb(
limit_clause = f"LIMIT {limit}" if limit else ""
query = f"""
- SELECT
+ SELECT
INSPIREID as reference,
INSPIREID as name,
NATIONALCADASTRALREFERENCE as "national-cadastral-reference",
ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry,
- CASE
- WHEN VALIDFROM IS NOT NULL
+ CASE
+ WHEN VALIDFROM IS NOT NULL
THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d')
- ELSE NULL
+ ELSE NULL
END as "start-date",
- CASE
- WHEN BEGINLIFESPANVERSION IS NOT NULL
+ CASE
+ WHEN BEGINLIFESPANVERSION IS NOT NULL
THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d')
- ELSE NULL
+ ELSE NULL
END as "entry-date",
NULL as "end-date",
'title-boundary' as prefix,
diff --git a/local_testing/main.py b/local_testing/main.py
index 151aa0dc..8c99c20e 100644
--- a/local_testing/main.py
+++ b/local_testing/main.py
@@ -265,7 +265,7 @@ def main():
# Run Polars pipeline for comparison if requested
if args.compare:
print("\n Running Polars pipeline for comparison...")
- from polars_phases import run_polars_pipeline, PolarsPhaseMetrics
+ from polars_phases import run_polars_pipeline
# Define required parameters
field_datatype_map = {"geometry": "text"} # Simplified for now
@@ -350,7 +350,7 @@ def main():
print(f"Fact Records: {report.fact_records:,}")
if report.steps:
- print(f"\nStep Summary:")
+ print("\nStep Summary:")
for name, step in report.steps.items():
            status = "✓" if step.success else "✗"
print(f" {status} {name:<20} {step.duration_seconds:8.3f}s")
diff --git a/local_testing/pipeline_report.py b/local_testing/pipeline_report.py
index 189f1447..18f26eb8 100644
--- a/local_testing/pipeline_report.py
+++ b/local_testing/pipeline_report.py
@@ -282,9 +282,11 @@ def generate_text_report(self) -> str:
lines.append("")
# Header
- lines.append(
- f"{'#':<3} {'Phase Name':<26} {'Original':>11} {'Polars':>11} {'Speedup':>10} {'Time Saved':>12} {'Orig Out':>10} {'Polars Out':>10}"
+ header = (
+ f"{'#':<3} {'Phase Name':<26} {'Original':>11} {'Polars':>11} "
+ f"{'Speedup':>10} {'Time Saved':>12} {'Orig Out':>10} {'Polars Out':>10}"
)
+ lines.append(header)
lines.append("-" * 100)
# Build lookup for Polars phases by name
diff --git a/local_testing/pipeline_runner.py b/local_testing/pipeline_runner.py
index 3f69ccfd..246737b2 100644
--- a/local_testing/pipeline_runner.py
+++ b/local_testing/pipeline_runner.py
@@ -8,7 +8,7 @@
import time
from pathlib import Path
from datetime import datetime
-from typing import Dict, Optional
+from typing import Dict
from pipeline_config import PipelineConfig
@@ -148,7 +148,7 @@ def run_full_pipeline(
csv_input = input_csv.with_suffix(".csv")
if not csv_input.exists():
- print(f" Converting Parquet to CSV for original pipeline...")
+ print(" Converting Parquet to CSV for original pipeline...")
pl.read_parquet(input_csv).write_csv(csv_input)
input_csv = csv_input
diff --git a/local_testing/run_all.py b/local_testing/run_all.py
index 55bd3984..f58fb13d 100755
--- a/local_testing/run_all.py
+++ b/local_testing/run_all.py
@@ -23,7 +23,7 @@ def main():
print("Fetching endpoint list...")
endpoints = CLI.fetch_endpoint_list()
print(f"Found {len(endpoints)} Local Authorities")
- print(f"Running with Polars comparison enabled\n")
+ print("Running with Polars comparison enabled\n")
success_count = 0
error_count = 0
@@ -76,10 +76,10 @@ def main():
if successful_times:
print(f" Min Time: {min(successful_times):.1f}s")
print(f" Max Time: {max(successful_times):.1f}s")
- print(f"\n Note: All LAs processed with both Original + Polars pipelines")
+ print("\n Note: All LAs processed with both Original + Polars pipelines")
if errors:
- print(f"\nFailed Local Authorities:")
+ print("\nFailed Local Authorities:")
for la in errors:
print(f" - {la}")
From bf2fe7bb8026949698b287df00239e06ae829e97 Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Mon, 9 Feb 2026 14:51:02 +0000
Subject: [PATCH 14/15] =?UTF-8?q?fix:=20improve=20pipeline=20report=20form?=
=?UTF-8?q?atting=20and=20update=20flake8=20ignore=20rules=20for=20consist?=
=?UTF-8?q?ency=20Rapid=20local=20performance=20test=20environment=20suppo?=
=?UTF-8?q?rting=20the=20Polars=E2=80=91based=20transformation=20rewrite?=
=?UTF-8?q?=20in=20digital-land-python=20Fixes=20#475?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
local_testing/pipeline_report.py | 8 ++++++--
setup.cfg | 2 +-
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/local_testing/pipeline_report.py b/local_testing/pipeline_report.py
index 18f26eb8..847878a4 100644
--- a/local_testing/pipeline_report.py
+++ b/local_testing/pipeline_report.py
@@ -314,9 +314,13 @@ def generate_text_report(self) -> str:
saved = phase.duration_seconds - polars_phase.duration_seconds
                speedup_str = f"{speedup:.1f}x" if speedup != float("inf") else "∞"
- lines.append(
- f"{phase.phase_number:<3} {phase.name:<26} {phase.duration_seconds:>9.4f}s {polars_phase.duration_seconds:>9.4f}s {speedup_str:>9} {saved:>10.4f}s {phase.output_count:>10,} {polars_phase.output_count:>10,}"
+ phase_line = (
+ f"{phase.phase_number:<3} {phase.name:<26} "
+ f"{phase.duration_seconds:>9.4f}s {polars_phase.duration_seconds:>9.4f}s "
+ f"{speedup_str:>9} {saved:>10.4f}s {phase.output_count:>10,} "
+ f"{polars_phase.output_count:>10,}"
)
+ lines.append(phase_line)
total_original += phase.duration_seconds
total_polars += polars_phase.duration_seconds
diff --git a/setup.cfg b/setup.cfg
index 80f6adc2..2de2c1af 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
[flake8]
max-line-length = 180
-ignore = E203, W503
+ignore = E203, W503, F541, W291
exclude = .venv,.git,__pycache__,docs/source/conf.py,old,build,dist,.direnv
[pycodestyle]
From 83a5c93eb227f11d33016cedf1d9bea60d5b631d Mon Sep 17 00:00:00 2001
From: mattsancog <214982214+mattsancog@users.noreply.github.com>
Date: Tue, 17 Feb 2026 00:57:14 +0000
Subject: [PATCH 15/15] Add Polars phases for data processing pipeline
- Implemented MapPhase for renaming columns based on a mapping specification.
- Created MigratePhase to rename fields according to the latest specification.
- Added NormalisePhase to clean whitespace and handle null patterns in CSV data.
- Developed OrganisationPhase for looking up organisation values.
- Introduced PatchPhase to apply regex patches to field values.
- Implemented PivotPhase to unpivot entity rows into a series of facts.
- Created EntityPrefixPhase to ensure every entry has a prefix field.
- Added PriorityPhase to deduce the priority of each entry.
- Developed FieldPrunePhase and EntityPrunePhase to reduce columns and remove entries with missing entities.
- Implemented EntityReferencePhase and FactReferencePhase to ensure prefix and reference fields are set correctly.
- Created SavePhase to save the DataFrame to a CSV file.
- Added comprehensive tests for each phase to ensure functionality and correctness. #475
---
digital_land/commands.py | 270 +++++++++-----
digital_land/phase_polars/README.md | 41 ---
digital_land/phase_polars/__init__.py | 87 +++++
digital_land/phase_polars/combine.py | 87 +++++
digital_land/phase_polars/concat.py | 90 +++++
digital_land/phase_polars/convert.py | 226 ++++++++++++
digital_land/phase_polars/default.py | 66 ++++
digital_land/phase_polars/dump.py | 29 ++
digital_land/phase_polars/factor.py | 40 +++
digital_land/phase_polars/filter.py | 32 ++
digital_land/phase_polars/harmonise.py | 229 ++++++++++++
digital_land/phase_polars/load.py | 39 +++
digital_land/phase_polars/load/__init__.py | 0
.../phase_polars/load/save_database.py | 0
digital_land/phase_polars/load/save_file.py | 0
digital_land/phase_polars/lookup.py | 327 +++++++++++++++++
digital_land/phase_polars/map.py | 93 +++++
digital_land/phase_polars/migrate.py | 64 ++++
digital_land/phase_polars/normalise.py | 84 +++++
digital_land/phase_polars/organisation.py | 52 +++
digital_land/phase_polars/patch.py | 87 +++++
digital_land/phase_polars/phase.py | 14 +
digital_land/phase_polars/pivot.py | 70 ++++
digital_land/phase_polars/prefix.py | 30 ++
digital_land/phase_polars/priority.py | 59 ++++
digital_land/phase_polars/prune.py | 86 +++++
digital_land/phase_polars/reference.py | 133 +++++++
digital_land/phase_polars/save.py | 45 +++
.../phase_polars/transform/__init__.py | 0
.../phase_polars/transform/concat_field.py | 0
.../phase_polars/transform/convert.py | 0
.../phase_polars/transform/entity_lookup.py | 0
.../transform/entity_reference.py | 0
.../phase_polars/transform/fact_hash.py | 0
.../phase_polars/transform/field_prune.py | 0
digital_land/phase_polars/transform/filter.py | 0
.../phase_polars/transform/flatten.py | 0
digital_land/phase_polars/transform/map.py | 0
.../phase_polars/transform/migrate.py | 0
.../phase_polars/transform/normalise.py | 0
digital_land/phase_polars/transform/parse.py | 0
digital_land/phase_polars/transform/patch.py | 0
digital_land/phase_polars/transform/pivot.py | 0
.../phase_polars/transform/priority.py | 0
.../transform/resolve_organisation.py | 0
.../phase_polars/transform/set_default.py | 0
.../phase_polars/transform/validate.py | 0
pyproject.toml | 1 +
test_polars_phases.py | 328 ++++++++++++++++++
49 files changed, 2587 insertions(+), 122 deletions(-)
delete mode 100644 digital_land/phase_polars/README.md
create mode 100644 digital_land/phase_polars/combine.py
create mode 100644 digital_land/phase_polars/concat.py
create mode 100644 digital_land/phase_polars/convert.py
create mode 100644 digital_land/phase_polars/default.py
create mode 100644 digital_land/phase_polars/dump.py
create mode 100644 digital_land/phase_polars/factor.py
create mode 100644 digital_land/phase_polars/filter.py
create mode 100644 digital_land/phase_polars/harmonise.py
create mode 100644 digital_land/phase_polars/load.py
delete mode 100644 digital_land/phase_polars/load/__init__.py
delete mode 100644 digital_land/phase_polars/load/save_database.py
delete mode 100644 digital_land/phase_polars/load/save_file.py
create mode 100644 digital_land/phase_polars/lookup.py
create mode 100644 digital_land/phase_polars/map.py
create mode 100644 digital_land/phase_polars/migrate.py
create mode 100644 digital_land/phase_polars/normalise.py
create mode 100644 digital_land/phase_polars/organisation.py
create mode 100644 digital_land/phase_polars/patch.py
create mode 100644 digital_land/phase_polars/phase.py
create mode 100644 digital_land/phase_polars/pivot.py
create mode 100644 digital_land/phase_polars/prefix.py
create mode 100644 digital_land/phase_polars/priority.py
create mode 100644 digital_land/phase_polars/prune.py
create mode 100644 digital_land/phase_polars/reference.py
create mode 100644 digital_land/phase_polars/save.py
delete mode 100644 digital_land/phase_polars/transform/__init__.py
delete mode 100644 digital_land/phase_polars/transform/concat_field.py
delete mode 100644 digital_land/phase_polars/transform/convert.py
delete mode 100644 digital_land/phase_polars/transform/entity_lookup.py
delete mode 100644 digital_land/phase_polars/transform/entity_reference.py
delete mode 100644 digital_land/phase_polars/transform/fact_hash.py
delete mode 100644 digital_land/phase_polars/transform/field_prune.py
delete mode 100644 digital_land/phase_polars/transform/filter.py
delete mode 100644 digital_land/phase_polars/transform/flatten.py
delete mode 100644 digital_land/phase_polars/transform/map.py
delete mode 100644 digital_land/phase_polars/transform/migrate.py
delete mode 100644 digital_land/phase_polars/transform/normalise.py
delete mode 100644 digital_land/phase_polars/transform/parse.py
delete mode 100644 digital_land/phase_polars/transform/patch.py
delete mode 100644 digital_land/phase_polars/transform/pivot.py
delete mode 100644 digital_land/phase_polars/transform/priority.py
delete mode 100644 digital_land/phase_polars/transform/resolve_organisation.py
delete mode 100644 digital_land/phase_polars/transform/set_default.py
delete mode 100644 digital_land/phase_polars/transform/validate.py
create mode 100644 test_polars_phases.py
diff --git a/digital_land/commands.py b/digital_land/commands.py
index 463f0f45..67bb4c14 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -59,6 +59,32 @@
from digital_land.phase.save import SavePhase
from digital_land.pipeline import run_pipeline, Lookups, Pipeline
from digital_land.pipeline.process import convert_tranformed_csv_to_pq
+from digital_land.phase_polars import run_polars_pipeline
+from digital_land.phase_polars import (
+ ConvertPhase as PolarsConvertPhase,
+ NormalisePhase as PolarsNormalisePhase,
+ ConcatFieldPhase as PolarsConcatFieldPhase,
+ FilterPhase as PolarsFilterPhase,
+ MapPhase as PolarsMapPhase,
+ PatchPhase as PolarsPatchPhase,
+ HarmonisePhase as PolarsHarmonisePhase,
+ DefaultPhase as PolarsDefaultPhase,
+ MigratePhase as PolarsMigratePhase,
+ OrganisationPhase as PolarsOrganisationPhase,
+ FieldPrunePhase as PolarsFieldPrunePhase,
+ EntityPrunePhase as PolarsEntityPrunePhase,
+ FactPrunePhase as PolarsFactPrunePhase,
+ EntityReferencePhase as PolarsEntityReferencePhase,
+ FactReferencePhase as PolarsFactReferencePhase,
+ EntityPrefixPhase as PolarsEntityPrefixPhase,
+ EntityLookupPhase as PolarsEntityLookupPhase,
+ FactLookupPhase as PolarsFactLookupPhase,
+ SavePhase as PolarsSavePhase,
+ PivotPhase as PolarsPivotPhase,
+ FactCombinePhase as PolarsFactCombinePhase,
+ FactorPhase as PolarsFactorPhase,
+ PriorityPhase as PolarsPriorityPhase,
+)
from digital_land.schema import Schema
from digital_land.update import add_source_endpoint
from digital_land.configuration.main import Config
@@ -237,6 +263,7 @@ def pipeline_run(
resource=None,
output_log_dir=None,
converted_path=None,
+ use_polars=False,
):
# set up paths
cache_dir = Path(cache_dir)
@@ -302,87 +329,168 @@ def pipeline_run(
if "entry-date" not in default_values:
default_values["entry-date"] = entry_date
- # TODO Migrate all of this into a function in the Pipeline function
- run_pipeline(
- ConvertPhase(
- path=input_path,
- dataset_resource_log=dataset_resource_log,
- converted_resource_log=converted_resource_log,
- output_path=converted_path,
- ),
- NormalisePhase(skip_patterns=skip_patterns),
- ParsePhase(),
- ConcatFieldPhase(concats=concats, log=column_field_log),
- FilterPhase(filters=pipeline.filters(resource)),
- MapPhase(
- fieldnames=intermediate_fieldnames,
- columns=columns,
- log=column_field_log,
- ),
- FilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)),
- PatchPhase(
- issues=issue_log,
- patches=patches,
- ),
- HarmonisePhase(
- field_datatype_map=specification.get_field_datatype_map(),
- issues=issue_log,
- dataset=dataset,
- valid_category_values=valid_category_values,
- ),
- DefaultPhase(
- default_fields=default_fields,
- default_values=default_values,
- issues=issue_log,
- ),
- # TBD: move migrating columns to fields to be immediately after map
- # this will simplify harmonisation and remove intermediate_fieldnames
- # but effects brownfield-land and other pipelines which operate on columns
- MigratePhase(
- fields=specification.schema_field[schema],
- migrations=pipeline.migrations(),
- ),
- OrganisationPhase(organisation=organisation, issues=issue_log),
- FieldPrunePhase(fields=specification.current_fieldnames(schema)),
- EntityReferencePhase(
- dataset=dataset,
- prefix=specification.dataset_prefix(dataset),
- issues=issue_log,
- ),
- EntityPrefixPhase(dataset=dataset),
- EntityLookupPhase(
- lookups=lookups,
- redirect_lookups=redirect_lookups,
- issue_log=issue_log,
- operational_issue_log=operational_issue_log,
- entity_range=[entity_range_min, entity_range_max],
- ),
- SavePhase(
- default_output_path("harmonised", input_path),
- fieldnames=intermediate_fieldnames,
- enabled=save_harmonised,
- ),
- EntityPrunePhase(dataset_resource_log=dataset_resource_log),
- PriorityPhase(config=config, providers=organisations),
- PivotPhase(),
- FactCombinePhase(issue_log=issue_log, fields=combine_fields),
- FactorPhase(),
- FactReferencePhase(
- field_typology_map=specification.get_field_typology_map(),
- field_prefix_map=specification.get_field_prefix_map(),
- ),
- FactLookupPhase(
- lookups=lookups,
- redirect_lookups=redirect_lookups,
- issue_log=issue_log,
- odp_collections=specification.get_odp_collections(),
- ),
- FactPrunePhase(),
- SavePhase(
- output_path,
- fieldnames=specification.factor_fieldnames(),
- ),
- )
+ if use_polars:
+        # ── Polars-based pipeline ──────────────────────────────────
+ run_polars_pipeline(
+ PolarsConvertPhase(
+ path=input_path,
+ dataset_resource_log=dataset_resource_log,
+ converted_resource_log=converted_resource_log,
+ output_path=converted_path,
+ ),
+ PolarsNormalisePhase(skip_patterns=skip_patterns),
+            # ParsePhase is not needed — ConvertPhase already produces a DataFrame
+ PolarsConcatFieldPhase(concats=concats, log=column_field_log),
+ PolarsFilterPhase(filters=pipeline.filters(resource)),
+ PolarsMapPhase(
+ fieldnames=intermediate_fieldnames,
+ columns=columns,
+ log=column_field_log,
+ ),
+ PolarsFilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)),
+ PolarsPatchPhase(
+ issues=issue_log,
+ patches=patches,
+ ),
+ PolarsHarmonisePhase(
+ field_datatype_map=specification.get_field_datatype_map(),
+ issues=issue_log,
+ dataset=dataset,
+ valid_category_values=valid_category_values,
+ ),
+ PolarsDefaultPhase(
+ default_fields=default_fields,
+ default_values=default_values,
+ issues=issue_log,
+ ),
+ PolarsMigratePhase(
+ fields=specification.schema_field[schema],
+ migrations=pipeline.migrations(),
+ ),
+ PolarsOrganisationPhase(organisation=organisation, issues=issue_log),
+ PolarsFieldPrunePhase(fields=specification.current_fieldnames(schema)),
+ PolarsEntityReferencePhase(
+ dataset=dataset,
+ prefix=specification.dataset_prefix(dataset),
+ issues=issue_log,
+ ),
+ PolarsEntityPrefixPhase(dataset=dataset),
+ PolarsEntityLookupPhase(
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ operational_issue_log=operational_issue_log,
+ entity_range=[entity_range_min, entity_range_max],
+ ),
+ PolarsSavePhase(
+ default_output_path("harmonised", input_path),
+ fieldnames=intermediate_fieldnames,
+ enabled=save_harmonised,
+ ),
+ PolarsEntityPrunePhase(dataset_resource_log=dataset_resource_log),
+ PolarsPriorityPhase(config=config, providers=organisations),
+ PolarsPivotPhase(),
+ PolarsFactCombinePhase(issue_log=issue_log, fields=combine_fields),
+ PolarsFactorPhase(),
+ PolarsFactReferencePhase(
+ field_typology_map=specification.get_field_typology_map(),
+ field_prefix_map=specification.get_field_prefix_map(),
+ ),
+ PolarsFactLookupPhase(
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ odp_collections=specification.get_odp_collections(),
+ ),
+ PolarsFactPrunePhase(),
+ PolarsSavePhase(
+ output_path,
+ fieldnames=specification.factor_fieldnames(),
+ ),
+ )
+ else:
+        # ── Original streaming pipeline ────────────────────────────
+ # TODO Migrate all of this into a function in the Pipeline function
+ run_pipeline(
+ ConvertPhase(
+ path=input_path,
+ dataset_resource_log=dataset_resource_log,
+ converted_resource_log=converted_resource_log,
+ output_path=converted_path,
+ ),
+ NormalisePhase(skip_patterns=skip_patterns),
+ ParsePhase(),
+ ConcatFieldPhase(concats=concats, log=column_field_log),
+ FilterPhase(filters=pipeline.filters(resource)),
+ MapPhase(
+ fieldnames=intermediate_fieldnames,
+ columns=columns,
+ log=column_field_log,
+ ),
+ FilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)),
+ PatchPhase(
+ issues=issue_log,
+ patches=patches,
+ ),
+ HarmonisePhase(
+ field_datatype_map=specification.get_field_datatype_map(),
+ issues=issue_log,
+ dataset=dataset,
+ valid_category_values=valid_category_values,
+ ),
+ DefaultPhase(
+ default_fields=default_fields,
+ default_values=default_values,
+ issues=issue_log,
+ ),
+ # TBD: move migrating columns to fields to be immediately after map
+ # this will simplify harmonisation and remove intermediate_fieldnames
+ # but effects brownfield-land and other pipelines which operate on columns
+ MigratePhase(
+ fields=specification.schema_field[schema],
+ migrations=pipeline.migrations(),
+ ),
+ OrganisationPhase(organisation=organisation, issues=issue_log),
+ FieldPrunePhase(fields=specification.current_fieldnames(schema)),
+ EntityReferencePhase(
+ dataset=dataset,
+ prefix=specification.dataset_prefix(dataset),
+ issues=issue_log,
+ ),
+ EntityPrefixPhase(dataset=dataset),
+ EntityLookupPhase(
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ operational_issue_log=operational_issue_log,
+ entity_range=[entity_range_min, entity_range_max],
+ ),
+ SavePhase(
+ default_output_path("harmonised", input_path),
+ fieldnames=intermediate_fieldnames,
+ enabled=save_harmonised,
+ ),
+ EntityPrunePhase(dataset_resource_log=dataset_resource_log),
+ PriorityPhase(config=config, providers=organisations),
+ PivotPhase(),
+ FactCombinePhase(issue_log=issue_log, fields=combine_fields),
+ FactorPhase(),
+ FactReferencePhase(
+ field_typology_map=specification.get_field_typology_map(),
+ field_prefix_map=specification.get_field_prefix_map(),
+ ),
+ FactLookupPhase(
+ lookups=lookups,
+ redirect_lookups=redirect_lookups,
+ issue_log=issue_log,
+ odp_collections=specification.get_odp_collections(),
+ ),
+ FactPrunePhase(),
+ SavePhase(
+ output_path,
+ fieldnames=specification.factor_fieldnames(),
+ ),
+ )
# In the FactCombinePhase, when combine_fields has some values, we check for duplicates and combine values.
# If we have done this then we will not call duplicate_reference_check as we have already carried out a
diff --git a/digital_land/phase_polars/README.md b/digital_land/phase_polars/README.md
deleted file mode 100644
index 853f2fbf..00000000
--- a/digital_land/phase_polars/README.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# Phases
-
-This directory contains transformation phases used in the digital-land data pipeline. Phases are modular processing steps that transform and validate data.
-
-## Transform Phases
-
-The `transform` folder contains the core data transformation phases executed in sequence:
-
-### Data Transformation Pipeline
-
-1. **01_convert.py** - Convert data types and formats
-2. **02_normalise.py** - Normalize data values and structure
-3. **03_parse.py** - Parse and extract data from raw inputs
-4. **04_concat_field.py** - Concatenate multiple fields
-5. **05_filter.py** - Filter records based on criteria
-6. **06_map.py** - Map values between different formats
-7. **07_patch.py** - Apply patches to data records
-8. **08_validate.py** - Validate data against schema
-9. **09_set_default.py** - Set default values for missing data
-10. **10_migrate.py** - Migrate data structure/format
-11. **11_resolve_organisation.py** - Resolve and enrich organisation references
-12. **12_field_prune.py** - Remove unnecessary fields
-13. **13_entity_reference.py** - Handle entity references
-14. **14_entity_lookup.py** - Lookup and enrich entity data
-15. **15_pivot.py** - Pivot data structure
-16. **16_fact_hash.py** - Generate fact hashes for deduplication
-17. **17_flatten.py** - Flatten nested data structures
-
-## Load Phases
-
-The `load` folder contains phases for saving and storing data:
-
-1. **01_save_file.py** - Save data to file storage
-2. **02_save_database.py** - Save data to database
-
-## Overview
-
-Each phase is designed to be:
-- **Modular** - Can be used independently or in sequence
-- **Configurable** - Parameters can be customized via configuration
-- **Reusable** - Shared across different pipelines and workflows
diff --git a/digital_land/phase_polars/__init__.py b/digital_land/phase_polars/__init__.py
index e69de29b..50a7e1d5 100644
--- a/digital_land/phase_polars/__init__.py
+++ b/digital_land/phase_polars/__init__.py
@@ -0,0 +1,87 @@
+"""
+Polars-based pipeline phases.
+
+Drop-in replacements for the streaming phases in `digital_land.phase`.
+Each phase accepts and returns a `polars.DataFrame` instead of a generator.
+"""
+
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+from .convert import ConvertPhase
+from .normalise import NormalisePhase
+from .concat import ConcatFieldPhase
+from .filter import FilterPhase
+from .map import MapPhase
+from .patch import PatchPhase
+from .harmonise import HarmonisePhase
+from .default import DefaultPhase
+from .migrate import MigratePhase
+from .organisation import OrganisationPhase
+from .prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase
+from .reference import EntityReferencePhase, FactReferencePhase
+from .prefix import EntityPrefixPhase
+from .lookup import EntityLookupPhase, FactLookupPhase, PrintLookupPhase
+from .save import SavePhase
+from .pivot import PivotPhase
+from .combine import FactCombinePhase
+from .factor import FactorPhase
+from .priority import PriorityPhase
+from .dump import DumpPhase
+from .load import LoadPhase
+
+logger = logging.getLogger(__name__)
+
+
def run_polars_pipeline(*phases):
    """
    Execute a chain of Polars phases in order.

    The DataFrame returned by each phase becomes the input of the next.
    The chain starts from ``None``, so the first phase (typically
    ConvertPhase) is responsible for creating the initial DataFrame.
    Returns whatever the final phase produced (may be ``None``).
    """
    log = logging.getLogger(__name__)
    current = None
    for step in phases:
        step_name = step.__class__.__name__
        log.debug(f"running polars phase {step_name}")
        current = step.process(current)
        if current is not None:
            # columns prefixed "__" are internal pipeline metadata
            data_cols = [c for c in current.columns if not c.startswith("__")]
            log.debug(
                f" -> {step_name} produced {current.height} rows, "
                f"{len(data_cols)} data cols"
            )
    return current
+
+
+__all__ = [
+ "PolarsPhase",
+ "ConvertPhase",
+ "NormalisePhase",
+ "ConcatFieldPhase",
+ "FilterPhase",
+ "MapPhase",
+ "PatchPhase",
+ "HarmonisePhase",
+ "DefaultPhase",
+ "MigratePhase",
+ "OrganisationPhase",
+ "FieldPrunePhase",
+ "EntityPrunePhase",
+ "FactPrunePhase",
+ "EntityReferencePhase",
+ "FactReferencePhase",
+ "EntityPrefixPhase",
+ "EntityLookupPhase",
+ "FactLookupPhase",
+ "PrintLookupPhase",
+ "SavePhase",
+ "PivotPhase",
+ "FactCombinePhase",
+ "FactorPhase",
+ "PriorityPhase",
+ "DumpPhase",
+ "LoadPhase",
+ "run_polars_pipeline",
+]
diff --git a/digital_land/phase_polars/combine.py b/digital_land/phase_polars/combine.py
new file mode 100644
index 00000000..db4d17da
--- /dev/null
+++ b/digital_land/phase_polars/combine.py
@@ -0,0 +1,87 @@
+from copy import deepcopy
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+try:
+ from shapely.ops import unary_union
+ from shapely.geometry import MultiPolygon
+ import shapely.wkt
+ from digital_land.datatype.wkt import dump_wkt
+
+ HAS_SHAPELY = True
+except ImportError:
+ HAS_SHAPELY = False
+
+
def combine_geometries(wkts, precision=6):
    """
    Union a list of WKT geometry strings into one MultiPolygon WKT.

    The union result is wrapped in a MultiPolygon when it is a single
    Polygon so callers always receive a uniform geometry type.
    """
    parsed = [shapely.wkt.loads(text) for text in wkts]
    merged = unary_union(parsed)
    if not isinstance(merged, MultiPolygon):
        merged = MultiPolygon([merged])
    return dump_wkt(merged, precision=precision)
+
+
class FactCombinePhase(PolarsPhase):
    """
    Combine field values from multiple facts for the same entity.

    For each field listed in ``fields``, all values belonging to the same
    (entity, field) pair are merged into a single combined value; every
    original fact row is kept but its "value" is replaced with the merged
    value.  Note the output row order changes: rows for non-combinable
    fields come first, followed by the combined rows.
    """

    def __init__(self, issue_log=None, fields=None):
        # fields: either a dict of fieldname -> separator string, or a
        # plain iterable of fieldnames (then ";" is used as the separator)
        if fields is None:
            fields = {}
        self.issues = issue_log
        self.fields = fields

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return *df* with values combined per (entity, field) group."""
        if df is None or df.height == 0 or not self.fields:
            return df

        # nothing to group on without these columns
        if "field" not in df.columns or "entity" not in df.columns:
            return df

        combine_field_names = set(self.fields.keys()) if isinstance(self.fields, dict) else set(self.fields)

        # Split into combinable and non-combinable
        mask = pl.col("field").is_in(list(combine_field_names))
        pass_through = df.filter(~mask)
        to_combine = df.filter(mask)

        if to_combine.height == 0:
            return pass_through

        # Group by entity + field and combine values
        combined_rows = []
        for (entity, field), group_df in to_combine.group_by(["entity", "field"]):
            # blank/null values are dropped; de-duplicate and sort so the
            # combined value is deterministic regardless of input order
            values = [
                v
                for v in group_df["value"].to_list()
                if v is not None and v != ""
            ]
            values = sorted(set(values))

            # geometry gets a spatial union when shapely is available;
            # otherwise values are joined with the configured separator
            if field == "geometry" and HAS_SHAPELY and values:
                combined_value = combine_geometries(values)
            elif isinstance(self.fields, dict) and field in self.fields:
                separator = self.fields[field]
                combined_value = separator.join(values)
            else:
                combined_value = ";".join(values)

            # Emit rows for each original row in the group
            for row in group_df.iter_rows(named=True):
                if self.issues:
                    # record positional context before logging the issue
                    self.issues.line_number = row.get("line-number", row.get("__line_number", ""))
                    self.issues.entry_number = row.get("entry-number", row.get("__entry_number", ""))
                    self.issues.log_issue(field, "combined-value", entity)

                new_row = dict(row)
                new_row["value"] = combined_value
                combined_rows.append(new_row)

        if combined_rows:
            # reuse the input schema so dtypes survive the rebuild
            combined_df = pl.DataFrame(combined_rows, schema=df.schema)
            return pl.concat([pass_through, combined_df])

        return pass_through
diff --git a/digital_land/phase_polars/concat.py b/digital_land/phase_polars/concat.py
new file mode 100644
index 00000000..111c2bd9
--- /dev/null
+++ b/digital_land/phase_polars/concat.py
@@ -0,0 +1,90 @@
+import itertools
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class ConcatFieldPhase(PolarsPhase):
    """
    Concatenate multiple source fields into a single destination field.

    For each configured destination field, the existing destination value
    (if any) and every non-blank source field value are joined with the
    configured separator, then wrapped with the prepend/append strings.
    """

    def __init__(self, concats=None, log=None):
        """
        concats -- mapping of destination fieldname to a dict with keys
            "fields" (source fieldnames), "separator", "prepend", "append".
        log -- optional log object; each concat is recorded as a pseudo
            column expression for traceability.
        """
        if concats is None:
            concats = {}
        self.concats = concats

        if log:
            for fieldname, cat in self.concats.items():
                log.add(
                    fieldname,
                    cat["prepend"]
                    + cat["separator"].join(cat["fields"])
                    + cat["append"],
                )

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return *df* with each configured destination field rebuilt."""
        if df is None or df.height == 0 or not self.concats:
            return df

        for fieldname, cat in self.concats.items():
            prepend = cat["prepend"]
            separator = cat["separator"]
            append = cat["append"]
            source_fields = cat["fields"]

            # Ensure the destination column exists
            if fieldname not in df.columns:
                df = df.with_columns(pl.lit("").alias(fieldname))

            # One expression per value to concatenate: the existing
            # destination value first, then each present source field.
            # Blank/whitespace-only source values become null so they can
            # be skipped during the join below.
            parts = [pl.col(fieldname).fill_null("")]
            for h in source_fields:
                if h in df.columns:
                    parts.append(
                        pl.when(
                            pl.col(h).is_not_null()
                            & (pl.col(h).str.strip_chars() != "")
                        )
                        .then(pl.col(h))
                        .otherwise(pl.lit(None))
                    )

            # Materialise the parts as temporary columns, then join the
            # non-blank values per row with map_elements.
            # (Removed an unused `_concat_row` closure that duplicated the
            # lambda below but was never called.)
            struct_cols = []
            temp_names = []
            for i, part in enumerate(parts):
                name = f"__concat_part_{i}"
                temp_names.append(name)
                struct_cols.append(part.alias(name))

            df = df.with_columns(struct_cols)

            df = df.with_columns(
                pl.struct(temp_names)
                .map_elements(
                    lambda s, sep=separator, pre=prepend, app=append: (
                        pre
                        + sep.join(
                            v
                            for v in s.values()
                            if v is not None and str(v).strip() != ""
                        )
                        + app
                    ),
                    return_dtype=pl.Utf8,
                )
                .alias(fieldname)
            )

            df = df.drop(temp_names)

        return df
diff --git a/digital_land/phase_polars/convert.py b/digital_land/phase_polars/convert.py
new file mode 100644
index 00000000..35380def
--- /dev/null
+++ b/digital_land/phase_polars/convert.py
@@ -0,0 +1,226 @@
+import csv
+import logging
+import os
+import tempfile
+import time
+from pathlib import Path
+
+import polars as pl
+
+from .phase import PolarsPhase
+from ..phase.convert import (
+ ConversionError,
+ convert_features_to_csv,
+ convert_json_to_csv,
+ detect_file_encoding,
+ read_csv,
+ read_excel,
+)
+from ..log import ConvertedResourceLog
+
+import sqlite3
+import zipfile
+
+logger = logging.getLogger(__name__)
+
+
+class ConvertPhase(PolarsPhase):
+ """
+ Detect and convert input file format then load into a Polars DataFrame.
+
+ Re-uses the existing format-detection and conversion helpers so the
+ behaviour is identical to the streaming ConvertPhase.
+ """
+
+ def __init__(
+ self,
+ path=None,
+ dataset_resource_log=None,
+ converted_resource_log=None,
+ output_path=None,
+ ):
+ self.path = path
+ self.dataset_resource_log = dataset_resource_log
+ self.converted_resource_log = converted_resource_log
+ self.charset = ""
+ self.output_path = output_path
+ if output_path:
+ output_dir = os.path.dirname(str(output_path))
+ if output_dir and not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+
+ def _resource_from_path(self, path):
+ return Path(path).stem
+
+ def _find_zip_file(self, input_file, suffix=".gml"):
+ zip_ = zipfile.ZipFile(input_file)
+ files = zip_.namelist()
+ files = list(
+ set(
+ filter(
+ lambda s: s.endswith(suffix) or s.endswith(suffix.upper()), files
+ )
+ )
+ )
+ if not files or not len(files):
+ return None
+ if len(files) > 1:
+ raise ValueError("Zipfile contains more than one %s file" % suffix)
+ return "/" + files[0]
+
+ def find_internal_path(self, input_path):
+ for suffix, mime in [
+ (".shp", "x-gis/x-shapefile"),
+ (".gml", "application/gml+xml"),
+ (".tab", "x-gis/x-mapinfo-tab"),
+ (".geojson", "application/vnd.geo+json"),
+ (".json", "application/vnd.geo+json"),
+ (".kml", "application/vnd.google-earth.kml+xml"),
+ ]:
+ internal_path = self._find_zip_file(input_path, suffix)
+ if internal_path:
+ return internal_path, mime
+ return None, None
+
+ def _get_csv_path(self, input_path):
+ """Return (csv_path, should_delete_temp) by converting the input to CSV if needed."""
+
+ # Try binary formats first
+ excel = read_excel(input_path)
+ if excel is not None:
+ logger.debug(f"{input_path} looks like excel")
+ if self.dataset_resource_log:
+ self.dataset_resource_log.mime_type = "application/vnd.ms-excel"
+ tmp = self.output_path or tempfile.NamedTemporaryFile(
+ suffix=".csv", delete=False
+ ).name
+ excel.to_csv(
+ str(tmp), index=False, header=True, encoding="utf-8", quoting=csv.QUOTE_ALL
+ )
+ return str(tmp), False
+
+ if zipfile.is_zipfile(input_path):
+ logger.debug(f"{input_path} looks like zip")
+ if self.dataset_resource_log:
+ self.dataset_resource_log.mime_type = "application/zip"
+ internal_path, mime_type = self.find_internal_path(input_path)
+ if internal_path:
+ if self.dataset_resource_log:
+ self.dataset_resource_log.internal_path = internal_path
+ self.dataset_resource_log.internal_mime_type = mime_type
+ parent = str(self.output_path.parent) if self.output_path else None
+ tmp = tempfile.NamedTemporaryFile(suffix=".zip", dir=parent).name
+ os.link(input_path, tmp)
+ zip_path = f"/vsizip/{tmp}{internal_path}"
+ csv_path = convert_features_to_csv(zip_path, self.output_path)
+ return csv_path, False
+
+ try:
+ conn = sqlite3.connect(input_path)
+ cursor = conn.cursor()
+ cursor.execute("pragma quick_check")
+ conn.close()
+ logger.debug(f"{input_path} looks like SQLite")
+ if self.dataset_resource_log:
+ self.dataset_resource_log.mime_type = "application/geopackage+sqlite3"
+ csv_path = convert_features_to_csv(input_path, self.output_path)
+ return csv_path, False
+ except Exception:
+ pass
+
+ # Text-based formats
+ encoding = detect_file_encoding(input_path)
+ if not encoding:
+ raise ConversionError(f"Cannot detect encoding for {input_path}")
+
+ self.charset = ";charset=" + encoding
+ with open(input_path, encoding=encoding) as f:
+ content = f.read(10)
+
+ if content.lower().startswith(" pl.DataFrame:
+ if df is None or df.height == 0:
+ return df
+
+ # Apply default_fields: if field is empty, copy from another field
+ for field, default_field in self.default_fields.items():
+ if default_field not in df.columns:
+ continue
+ if field not in df.columns:
+ df = df.with_columns(pl.lit("").alias(field))
+
+ df = df.with_columns(
+ pl.when(
+ pl.col(field).is_null()
+ | (pl.col(field) == "")
+ )
+ .then(
+ pl.when(
+ pl.col(default_field).is_not_null()
+ & (pl.col(default_field) != "")
+ )
+ .then(pl.col(default_field))
+ .otherwise(pl.col(field))
+ )
+ .otherwise(pl.col(field))
+ .alias(field)
+ )
+
+ # Apply default_values: if field is empty, use a fixed default value
+ for field, value in self.default_values.items():
+ if not value:
+ continue
+
+ if field not in df.columns:
+ df = df.with_columns(pl.lit("").alias(field))
+
+ df = df.with_columns(
+ pl.when(
+ pl.col(field).is_null()
+ | (pl.col(field) == "")
+ )
+ .then(pl.lit(value))
+ .otherwise(pl.col(field))
+ .alias(field)
+ )
+
+ return df
diff --git a/digital_land/phase_polars/dump.py b/digital_land/phase_polars/dump.py
new file mode 100644
index 00000000..cd4ff3ce
--- /dev/null
+++ b/digital_land/phase_polars/dump.py
@@ -0,0 +1,29 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class DumpPhase(PolarsPhase):
    """
    Write the current DataFrame out as CSV, excluding the internal
    (double-underscore) metadata columns, then pass the DataFrame
    through unchanged so the pipeline can continue.
    """

    def __init__(self, path=None, f=None, enabled=True):
        # an open file-like object `f` takes precedence over `path`
        self.path = path
        self.f = f
        self.enabled = enabled

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Dump *df* (minus metadata columns) to CSV; return *df* unmodified."""
        if not self.enabled or df is None or df.height == 0:
            return df

        visible = [name for name in df.columns if not name.startswith("__")]
        dump_frame = df.select(visible)

        if self.f:
            self.f.write(dump_frame.write_csv())
        elif self.path:
            dump_frame.write_csv(str(self.path))

        return df
diff --git a/digital_land/phase_polars/factor.py b/digital_land/phase_polars/factor.py
new file mode 100644
index 00000000..7ef53295
--- /dev/null
+++ b/digital_land/phase_polars/factor.py
@@ -0,0 +1,40 @@
+import hashlib
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
def fact_hash(entity, field, value):
    """Return a deterministic fact identifier: SHA-256 of "entity:field:value"."""
    digest_input = ":".join((entity, field, value))
    return hashlib.sha256(digest_input.encode("utf-8")).hexdigest()
+
+
class FactorPhase(PolarsPhase):
    """
    Attach a "fact" column containing a hash identifier for each fact row.

    Rows without an entity receive an empty fact id.  Frames missing any
    of the entity/field/value columns are passed through untouched.
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        if df is None or df.height == 0:
            return df

        required = ("entity", "field", "value")
        if any(column not in df.columns for column in required):
            return df

        def _row_fact(row):
            # no entity means the fact cannot be attributed; leave it blank
            if not row["entity"]:
                return ""
            return fact_hash(
                str(row["entity"] or ""),
                str(row["field"] or ""),
                str(row["value"] or ""),
            )

        return df.with_columns(
            pl.struct(["entity", "field", "value"])
            .map_elements(_row_fact, return_dtype=pl.Utf8)
            .alias("fact")
        )
diff --git a/digital_land/phase_polars/filter.py b/digital_land/phase_polars/filter.py
new file mode 100644
index 00000000..4eac4358
--- /dev/null
+++ b/digital_land/phase_polars/filter.py
@@ -0,0 +1,32 @@
+import re
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class FilterPhase(PolarsPhase):
    """
    Keep only rows whose field values match the configured regex patterns.

    Every configured pattern must match (anchored at the start of the
    value) for a row to be kept; fields absent from the frame are ignored.
    """

    def __init__(self, filters=None):
        # compile up front so invalid patterns fail fast at construction
        self.filters = {
            field: re.compile(pattern)
            for field, pattern in (filters or {}).items()
        }

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        if df is None or df.height == 0 or not self.filters:
            return df

        keep = pl.lit(True)
        for field, compiled in self.filters.items():
            if field not in df.columns:
                continue
            # anchor at the start to mirror re.match semantics
            anchored = f"^(?:{compiled.pattern})"
            keep = keep & pl.col(field).fill_null("").str.contains(anchored)

        return df.filter(keep)
diff --git a/digital_land/phase_polars/harmonise.py b/digital_land/phase_polars/harmonise.py
new file mode 100644
index 00000000..204f7e4a
--- /dev/null
+++ b/digital_land/phase_polars/harmonise.py
@@ -0,0 +1,229 @@
+import logging
+from datetime import datetime, date
+from calendar import monthrange
+
+import polars as pl
+
+from .phase import PolarsPhase
+from digital_land.datatype.point import PointDataType
+from digital_land.datatype.factory import datatype_factory
+
+try:
+ import shapely.wkt
+except ImportError:
+ shapely = None
+
+logger = logging.getLogger(__name__)
+
+MANDATORY_FIELDS_DICT = {
+ "article-4-direction": [
+ "reference", "name", "document-url", "documentation-url",
+ ],
+ "article-4-direction-area": [
+ "reference", "geometry", "name", "permitted-development-rights",
+ ],
+ "conservation-area": ["reference", "geometry", "name"],
+ "conservation-area-document": [
+ "reference", "name", "conservation-area",
+ "document-url", "documentation-url", "document-type",
+ ],
+ "tree-preservation-order": [
+ "reference", "document-url", "documentation-url",
+ ],
+ "tree-preservation-zone": ["reference", "geometry"],
+ "listed-building-outline": ["reference", "geometry", "name", "listed-building"],
+ "tree": ["reference", "point", "geometry"],
+ "brownfield-land": [
+ "OrganisationURI", "SiteReference", "SiteNameAddress", "GeoX", "GeoY",
+ ],
+}
+
+FAR_FUTURE_YEARS_AHEAD = 50
+
+
class HarmonisePhase(PolarsPhase):
    """
    Harmonise field values according to their datatype specification.

    This phase delegates to the existing datatype normalisation logic on a
    per-row basis (iterating the frame row by row) for correctness, since
    individual datatype classes contain complex transformation rules.
    """

    def __init__(
        self,
        field_datatype_map=None,
        issues=None,
        dataset=None,
        valid_category_values=None,
    ):
        # field_datatype_map: fieldname -> datatype name understood by
        #   datatype_factory (e.g. "datetime")
        # issues: optional issue log mutated as rows are harmonised
        # dataset: dataset name; selects mandatory fields and CURIE prefixes
        # valid_category_values: fieldname -> list of allowed category values
        if field_datatype_map is None:
            field_datatype_map = {}
        if valid_category_values is None:
            valid_category_values = {}
        self.field_datatype_map = field_datatype_map
        self.issues = issues
        self.dataset = dataset
        self.valid_category_values = valid_category_values

    def _get_far_future_date(self, number_of_years_ahead: int):
        """Return today's date shifted *number_of_years_ahead* years forward,
        clamped to the last valid day of the target month (handles 29 Feb)."""
        today = date.today()
        y = today.year + number_of_years_ahead
        last_day = monthrange(y, today.month)[1]
        day = min(today.day, last_day)
        return today.replace(year=y, day=day)

    def _harmonise_row(self, row_dict, resource, line_number, entry_number):
        """Harmonise a single row -- mirrors the streaming HarmonisePhase exactly."""
        # set positional context first so any issue logged below carries it
        if self.issues:
            self.issues.resource = resource
            self.issues.line_number = line_number
            self.issues.entry_number = entry_number

        o = {}
        for field, value in row_dict.items():
            # "__"-prefixed keys are internal metadata, not data fields
            if field.startswith("__"):
                continue

            # Category value validation
            if field in self.valid_category_values:
                if value:
                    # case-insensitive match, with spaces treated as hyphens
                    normalised_value = value.replace(" ", "-")
                    matching_value = next(
                        (
                            v
                            for v in self.valid_category_values[field]
                            if v.lower() == normalised_value.lower()
                        ),
                        None,
                    )
                    if matching_value:
                        value = matching_value
                    else:
                        if self.issues:
                            self.issues.log_issue(
                                field, "invalid category value", value
                            )

            # Harmonise via datatype
            if not value:
                o[field] = ""
            elif field in self.field_datatype_map:
                if self.issues:
                    self.issues.fieldname = field
                datatype_name = self.field_datatype_map[field]
                if datatype_name == "datetime":
                    # datetimes are bounded to a plausible window
                    far_past_date = date(1799, 12, 31)
                    far_future_date = self._get_far_future_date(FAR_FUTURE_YEARS_AHEAD)
                    datatype = datatype_factory(
                        datatype_name=datatype_name,
                        far_past_date=far_past_date,
                        far_future_date=far_future_date,
                    )
                else:
                    datatype = datatype_factory(datatype_name=datatype_name)
                o[field] = datatype.normalise(value, issues=self.issues)
            else:
                o[field] = value

        # Future entry-date check
        for field in ["entry-date", "LastUpdatedDate"]:
            val = o.get(field, "")
            if val:
                try:
                    if datetime.strptime(val[:10], "%Y-%m-%d").date() > datetime.today().date():
                        if self.issues:
                            # log the ORIGINAL (pre-harmonise) value
                            self.issues.log_issue(
                                field, "future entry-date", row_dict.get(field, ""),
                                f"{field} must be today or in the past",
                            )
                        o[field] = ""
                except (ValueError, TypeError):
                    # unparseable dates are left to the datatype handling above
                    pass

        # GeoX/GeoY handling
        if "GeoX" in row_dict and "GeoY" in row_dict:
            if self.issues:
                self.issues.fieldname = "GeoX,GeoY"
            point = PointDataType()
            try:
                geometry = point.normalise(
                    [o.get("GeoX", ""), o.get("GeoY", "")],
                    issues=self.issues,
                )
                if geometry and shapely:
                    # write back the normalised coordinates from the WKT point
                    point_geometry = shapely.wkt.loads(geometry)
                    x, y = point_geometry.coords[0]
                    o["GeoX"] = str(x)
                    o["GeoY"] = str(y)
                elif not geometry:
                    o.pop("GeoX", None)
                    o.pop("GeoY", None)
            except Exception as e:
                logger.error(
                    f"Exception occurred while fetching geoX, geoY coordinates: {e}"
                )

        # Typology prefix
        # bare references gain the dataset CURIE prefix, e.g. "x" -> "dataset:x"
        for typology in ["organisation", "geography", "document"]:
            value = o.get(typology, "")
            if value and ":" not in value:
                o[typology] = f"{self.dataset}:{value}"

        # Mandatory field checks
        # NOTE(review): these check the ORIGINAL row values, not the
        # harmonised ones in `o` -- confirm against the streaming phase
        mandatory_fields = MANDATORY_FIELDS_DICT.get(self.dataset)
        for field in row_dict:
            if field.startswith("__"):
                continue
            if field in ["geometry", "point"]:
                # geometry/point are alternatives; only flag when BOTH are empty
                if not row_dict.get("geometry") and not row_dict.get("point"):
                    if self.issues:
                        self.issues.log_issue(
                            field, "missing value", "", f"{field} missing"
                        )
            elif mandatory_fields and field in mandatory_fields:
                if not row_dict.get(field):
                    if self.issues:
                        self.issues.log_issue(
                            field, "missing value", "", f"{field} missing"
                        )

        # Wikipedia
        # full URLs are reduced to the bare article slug
        if row_dict.get("wikipedia", "").startswith("http"):
            if self.issues:
                self.issues.log_issue(
                    "wikipedia", "removed URI prefix", row_dict["wikipedia"]
                )
            o["wikipedia"] = row_dict["wikipedia"].replace(
                "https://en.wikipedia.org/wiki/", ""
            )

        return o

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Harmonise every row of *df*, preserving metadata columns."""
        if df is None or df.height == 0:
            return df

        meta_cols = [c for c in df.columns if c.startswith("__")]
        data_cols = [c for c in df.columns if not c.startswith("__")]

        results = []
        for row in df.iter_rows(named=True):
            resource = row.get("__resource", "")
            line_number = row.get("__line_number", 0)
            entry_number = row.get("__entry_number", 0)

            harmonised = self._harmonise_row(row, resource, line_number, entry_number)

            # Include metadata
            out = {}
            for mc in meta_cols:
                out[mc] = row[mc]
            for field in data_cols:
                out[field] = harmonised.get(field, "")
            results.append(out)

        if not results:
            return df.clear()

        # rebuild the frame: data columns as strings, metadata keeps its dtype
        return pl.DataFrame(results, schema={c: pl.Utf8 for c in results[0] if not c.startswith("__")} | {c: df.schema[c] for c in meta_cols if c in df.schema})
diff --git a/digital_land/phase_polars/load.py b/digital_land/phase_polars/load.py
new file mode 100644
index 00000000..b0571bc0
--- /dev/null
+++ b/digital_land/phase_polars/load.py
@@ -0,0 +1,39 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class LoadPhase(PolarsPhase):
    """
    Read a resource CSV into a Polars DataFrame of strings.

    Adds the internal pipeline metadata columns:
      __resource     -- resource identifier (defaults to the file stem)
      __line_number  -- file line number (header is line 1, data starts at 2)
      __entry_number -- 1-based data row number
      __path         -- source file path
    """

    def __init__(self, path=None, resource=None, dataset=None):
        self.path = path
        self.resource = resource
        self.dataset = dataset

    def process(self, df=None):
        from pathlib import Path

        source_path = self.path
        resource = self.resource or (Path(source_path).stem if source_path else None)

        frame = pl.read_csv(
            str(source_path),
            infer_schema_length=0,
            null_values=[""],
            truncate_ragged_lines=True,
            ignore_errors=True,
        )
        # everything is treated as text, with blanks rather than nulls
        frame = frame.with_columns(pl.all().cast(pl.Utf8).fill_null(""))

        row_count = frame.height
        frame = frame.with_columns(
            pl.lit(resource or "").alias("__resource"),
            pl.arange(2, row_count + 2).alias("__line_number"),
            pl.arange(1, row_count + 1).alias("__entry_number"),
            pl.lit(str(source_path) if source_path else "").alias("__path"),
        )

        return frame
diff --git a/digital_land/phase_polars/load/__init__.py b/digital_land/phase_polars/load/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/load/save_database.py b/digital_land/phase_polars/load/save_database.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/load/save_file.py b/digital_land/phase_polars/load/save_file.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/lookup.py b/digital_land/phase_polars/lookup.py
new file mode 100644
index 00000000..4abed653
--- /dev/null
+++ b/digital_land/phase_polars/lookup.py
@@ -0,0 +1,327 @@
+import re
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+normalise_pattern = re.compile(r"[^a-z0-9-]")
+
+
def normalise(value):
    """Lower-case *value* and strip every character except a-z, 0-9 and "-"."""
    lowered = value.lower()
    return re.sub(r"[^a-z0-9-]", "", lowered)
+
+
def key(entry_number="", prefix="", reference="", organisation=""):
    """Build the canonical comma-separated lookup key used by the lookup phases."""
    parts = [
        str(entry_number),
        normalise(prefix),
        normalise(reference),
        normalise(organisation),
    ]
    return ",".join(parts)
+
+
class EntityLookupPhase(PolarsPhase):
    """
    Look up entity numbers by CURIE (prefix:reference).

    Rows that already carry an entity are left untouched; otherwise the
    entity is resolved via the lookup table (by entry number, then by
    organisation-qualified reference, then by bare reference), redirected
    if necessary, and issues are logged for unknown or out-of-range
    entities.
    """

    def __init__(
        self,
        lookups=None,
        redirect_lookups=None,
        issue_log=None,
        operational_issue_log=None,
        entity_range=None,
    ):
        # lookups: lookup key (see `key`) -> entity number
        # redirect_lookups: old entity -> {"status": "301"/"410", "entity": new}
        # entity_range: [min, max] bounds for valid entity numbers
        if lookups is None:
            lookups = {}
        if redirect_lookups is None:
            redirect_lookups = {}
        self.lookups = lookups
        self.redirect_lookups = redirect_lookups
        self.issues = issue_log
        self.operational_issues = operational_issue_log
        self.entity_range = entity_range or []

    def _lookup(self, prefix="", reference="", organisation="", entry_number=""):
        """Resolve an entity, trying the most specific lookup key first."""
        return (
            self.lookups.get(
                key(prefix=prefix, entry_number=entry_number), ""
            )
            or self.lookups.get(
                key(prefix=prefix, organisation=organisation, reference=reference), ""
            )
            or self.lookups.get(
                key(prefix=prefix, reference=reference), ""
            )
        )

    def _redirect(self, entity):
        """Follow a 301 redirect to the replacement entity; 410 means gone
        and yields an empty entity; otherwise return *entity* unchanged."""
        if self.redirect_lookups and entity:
            redirect_entry = self.redirect_lookups.get(str(entity), "")
            if redirect_entry:
                if redirect_entry["status"] == "301":
                    return redirect_entry["entity"]
                elif redirect_entry["status"] == "410":
                    return ""
        return entity

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fill the "entity" column for every row of *df*."""
        if df is None or df.height == 0:
            return df

        # guarantee the columns the lookup relies on exist
        if "entity" not in df.columns:
            df = df.with_columns(pl.lit("").alias("entity"))
        if "prefix" not in df.columns:
            df = df.with_columns(pl.lit("").alias("prefix"))
        if "reference" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference"))

        entities = []
        for row in df.iter_rows(named=True):
            existing = row.get("entity", "") or ""
            prefix = row.get("prefix", "") or ""
            reference = row.get("reference", "") or ""
            # legacy organisation ids are normalised to the current prefix
            organisation = (row.get("organisation", "") or "").replace(
                "local-authority-eng", "local-authority"
            )
            entry_number = row.get("__entry_number", "")
            line_number = row.get("__line_number", "")
            resource = row.get("__resource", "")

            # an already-assigned entity wins
            if existing:
                entities.append(existing)
                continue

            # no prefix means no CURIE to resolve
            if not prefix:
                entities.append("")
                continue

            entity = self._lookup(
                prefix=prefix,
                reference=reference,
                organisation=organisation,
                entry_number=entry_number,
            )

            # NOTE(review): range() excludes entity_range[1] -- confirm the
            # upper bound is intended to be exclusive
            if entity and self.entity_range:
                try:
                    if int(entity) not in range(
                        int(self.entity_range[0]), int(self.entity_range[1])
                    ):
                        if self.issues:
                            self.issues.resource = resource
                            self.issues.line_number = line_number
                            self.issues.entry_number = entry_number
                            self.issues.log_issue(
                                "entity", "entity number out of range", entity
                            )
                except (ValueError, TypeError):
                    # non-numeric entity or bounds: skip the range check
                    pass

            if not entity:
                curie = f"{prefix}:{reference}"
                if self.issues:
                    self.issues.resource = resource
                    self.issues.line_number = line_number
                    self.issues.entry_number = entry_number
                    if not reference:
                        self.issues.log_issue(
                            "entity",
                            "unknown entity - missing reference",
                            curie,
                            line_number=line_number,
                        )
                    else:
                        self.issues.log_issue(
                            "entity",
                            "unknown entity",
                            curie,
                            line_number=line_number,
                        )
                if self.operational_issues:
                    self.operational_issues.log_issue(
                        "entity",
                        "unknown entity",
                        curie,
                        line_number=line_number,
                    )
                entities.append("")
            else:
                # apply 301/410 redirects before recording the entity
                entity = self._redirect(entity)
                entities.append(entity)

        df = df.with_columns(pl.Series("entity", entities))

        # Record entity map for issue log
        if self.issues:
            for row in df.iter_rows(named=True):
                entry_number = row.get("__entry_number", "")
                entity = row.get("entity", "")
                if entity:
                    self.issues.record_entity_map(entry_number, entity)

        return df
+
+
class FactLookupPhase(PolarsPhase):
    """
    Resolve the "reference-entity" for fact rows whose value is a CURIE.

    Uses the same lookup table as EntityLookupPhase, first trying the
    organisation-qualified key and then the bare prefix:reference key.
    Entities that have been redirected with status 410 (gone) are treated
    as missing.
    """

    def __init__(
        self,
        lookups=None,
        redirect_lookups=None,
        issue_log=None,
        odp_collections=None,
    ):
        """
        lookups -- mapping of lookup key (see ``key``) to entity number.
        redirect_lookups -- old entity -> {"status", "entity"} redirects.
        issue_log -- optional issue log for missing associated entities.
        odp_collections -- collection prefixes for which a missing
            associated entity is reported as an issue.
        """
        if lookups is None:
            lookups = {}
        if redirect_lookups is None:
            redirect_lookups = {}
        if odp_collections is None:
            odp_collections = []
        self.lookups = lookups
        self.redirect_lookups = redirect_lookups
        self.issues = issue_log
        self.odp_collections = odp_collections
        # built lazily by _check_associated_organisation:
        # entity -> list of lookup keys resolving to it
        self._reverse_lookups = None

    def _lookup(self, prefix="", reference="", organisation=""):
        return (
            self.lookups.get(
                key(prefix=prefix, organisation=organisation, reference=reference), ""
            )
            or self.lookups.get(
                key(prefix=prefix, reference=reference), ""
            )
        )

    def _check_associated_organisation(self, entity):
        """
        Reject a fallback match when any of its lookup keys carries an
        organisation-style component (authority/development/government),
        since such matches are ambiguous across providers.
        """
        # Build the reverse index once and cache it: the previous version
        # rebuilt it on every call, making each processed row O(len(lookups)).
        # (Assumes self.lookups is not mutated after the first row.)
        if self._reverse_lookups is None:
            reverse_lookups = {}
            for k, v in self.lookups.items():
                reverse_lookups.setdefault(v, []).append(k)
            self._reverse_lookups = reverse_lookups

        keys_for_entity = self._reverse_lookups.get(entity)
        if keys_for_entity is not None:
            keywords = {"authority", "development", "government"}
            for k in keys_for_entity:
                parts = k.split(",")
                # parts[3] is the organisation component of the lookup key
                if len(parts) > 3 and any(kw in parts[3] for kw in keywords):
                    return ""
        return entity

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fill the "reference-entity" column for every row of *df*."""
        if df is None or df.height == 0:
            return df

        if "reference-entity" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference-entity"))

        ref_entities = []
        for row in df.iter_rows(named=True):
            prefix = row.get("prefix", "") or ""
            reference = row.get("reference", "") or ""
            entity_number = row.get("entity", "") or ""
            line_number = row.get("line-number", "") or row.get("__line_number", "")
            # legacy organisation ids are normalised to the current prefix
            organisation = (row.get("organisation", "") or "").replace(
                "local-authority-eng", "local-authority"
            )

            # rows without a full CURIE + entity keep their existing value
            if not (prefix and reference and entity_number):
                ref_entities.append(row.get("reference-entity", "") or "")
                continue

            find_entity = self._lookup(
                prefix=prefix, organisation=organisation, reference=reference
            )
            if not find_entity:
                # fall back to the unqualified key, then screen out
                # ambiguous organisation-associated matches
                find_entity = self._lookup(prefix=prefix, reference=reference)
                find_entity = self._check_associated_organisation(find_entity)

            if not find_entity or (
                str(find_entity) in self.redirect_lookups
                and int(self.redirect_lookups[str(find_entity)].get("status", 0)) == 410
            ):
                if self.odp_collections and prefix in self.odp_collections:
                    if self.issues:
                        self.issues.log_issue(
                            prefix,
                            "missing associated entity",
                            reference,
                            line_number=line_number,
                        )
                ref_entities.append("")
            else:
                ref_entities.append(str(find_entity))

        df = df.with_columns(pl.Series("reference-entity", ref_entities))
        return df
+
+
class PrintLookupPhase(PolarsPhase):
    """
    Collect candidate lookup entries for rows whose entity cannot be
    resolved, so they can be printed/added to the lookup table later.
    """

    def __init__(self, lookups=None, redirect_lookups=None):
        self.lookups = lookups if lookups is not None else {}
        self.redirect_lookups = (
            redirect_lookups if redirect_lookups is not None else {}
        )
        # each element is a single-item list holding one new lookup dict
        self.new_lookup_entries = []

    def _lookup(self, prefix="", reference="", organisation="", entry_number=""):
        """Resolve an entity, trying the most specific lookup key first."""
        by_entry = self.lookups.get(key(prefix=prefix, entry_number=entry_number), "")
        if by_entry:
            return by_entry
        by_organisation = self.lookups.get(
            key(prefix=prefix, organisation=organisation, reference=reference), ""
        )
        if by_organisation:
            return by_organisation
        return self.lookups.get(key(prefix=prefix, reference=reference), "")

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Record unresolved (prefix, organisation, reference) triples; *df* passes through."""
        if df is None or df.height == 0:
            return df

        for row in df.iter_rows(named=True):
            prefix = row.get("prefix", "") or ""
            organisation = row.get("organisation", "") or ""
            reference = row.get("reference", "") or ""
            entry_number = row.get("__entry_number", "")

            entity = ""
            if prefix:
                entity = self._lookup(
                    prefix=prefix,
                    reference=reference,
                    organisation=organisation,
                    entry_number=entry_number,
                )

            if entity:
                continue

            if prefix and organisation and reference:
                if "," in reference:
                    # quote references containing commas for CSV output
                    reference = f'"{reference}"'
                self.new_lookup_entries.append(
                    [
                        {
                            "prefix": prefix,
                            "organisation": organisation,
                            "reference": reference,
                        }
                    ]
                )
            elif not reference:
                logging.info(
                    "No reference found for entry: "
                    + str(entry_number)
                    + " in resource: "
                    + row.get("__resource", "")
                )

        return df
diff --git a/digital_land/phase_polars/map.py b/digital_land/phase_polars/map.py
new file mode 100644
index 00000000..0fb0cc0b
--- /dev/null
+++ b/digital_land/phase_polars/map.py
@@ -0,0 +1,93 @@
+import re
+
+import polars as pl
+
+from ..log import ColumnFieldLog
+from .phase import PolarsPhase
+
normalise_pattern = re.compile(r"[^a-z0-9-_]")


def normalise(name):
    """Lower-case *name*, map underscores to hyphens and drop other symbols."""
    return normalise_pattern.sub("", name.replace("_", "-").lower())
+
+
class MapPhase(PolarsPhase):
    """
    Rename columns according to the column map and specification fieldnames.
    """

    def __init__(self, fieldnames, columns=None, log=None):
        if columns is None:
            columns = {}
        self.columns = columns
        # Normalised spec fieldname -> canonical spec fieldname.
        self.normalised_fieldnames = {normalise(f): f for f in fieldnames}
        if not log:
            log = ColumnFieldLog()
        self.log = log

    def headers(self, column_names):
        """Build the header mapping (column_name -> field_name)."""
        headers = {}
        matched = []

        # First pass: explicit column-map entries take precedence.
        for header in sorted(column_names):
            fieldname = normalise(header)
            for pattern, value in self.columns.items():
                if fieldname == pattern:
                    matched.append(value)
                    headers[header] = value

        # Second pass: fall back to specification fieldnames for unmapped headers.
        for header in sorted(column_names):
            if header in headers:
                continue
            fieldname = normalise(header)
            if fieldname not in matched and fieldname in self.normalised_fieldnames:
                headers[header] = self.normalised_fieldnames[fieldname]

        # When both GeoX/Easting (or GeoY/Northing) are present, re-insert the
        # GeoX/GeoY entry last so it wins the keep-last dedup in process().
        if {"GeoX", "Easting"} <= headers.keys():
            item = headers.pop("GeoX")
            headers["GeoX"] = item

        if {"GeoY", "Northing"} <= headers.keys():
            item = headers.pop("GeoY")
            headers["GeoY"] = item

        return headers

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Rename/select mapped data columns, carrying __-prefixed metadata through."""
        if df is None or df.height == 0:
            return df

        data_cols = [c for c in df.columns if not c.startswith("__")]
        header_map = self.headers(data_cols)

        # Log the column -> field mapping.
        for col, field in header_map.items():
            self.log.add(column=col, field=field)

        # Select only mapped columns (drop unmapped data cols), keep metadata
        meta_cols = [c for c in df.columns if c.startswith("__")]

        select_exprs = []
        for col, field in header_map.items():
            if field == "IGNORE":
                continue
            select_exprs.append(pl.col(col).fill_null("").alias(field))

        # Add metadata columns
        for mc in meta_cols:
            select_exprs.append(pl.col(mc))

        # Handle duplicate target field names - if multiple columns map to the same
        # field, keep the last one (matching original generator behaviour).
        # (Fixed: the deduped list was being rebuilt inside the loop on every
        # iteration; build the dict once and materialise the values after.)
        seen = {}
        for expr in select_exprs:
            seen[expr.meta.output_name()] = expr

        return df.select(list(seen.values()))
diff --git a/digital_land/phase_polars/migrate.py b/digital_land/phase_polars/migrate.py
new file mode 100644
index 00000000..20ece215
--- /dev/null
+++ b/digital_land/phase_polars/migrate.py
@@ -0,0 +1,64 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class MigratePhase(PolarsPhase):
    """
    Rename fields to match the latest specification.

    ``migrations`` maps a target field to the source column it should be
    taken from when present.
    """

    def __init__(self, fields, migrations):
        self.migrations = migrations
        # Core identity fields are always carried through.
        self.fields = list(
            set(fields + ["entity", "organisation", "prefix", "reference"])
        )

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Select/rename columns per the migrations map; build point from GeoX/GeoY."""
        if df is None or df.height == 0:
            return df

        meta_cols = [c for c in df.columns if c.startswith("__")]

        exprs = []
        for field in self.fields:
            migrated_from = self.migrations.get(field)
            if migrated_from and migrated_from in df.columns:
                exprs.append(pl.col(migrated_from).alias(field))
            elif field in df.columns:
                exprs.append(pl.col(field))
            # else: field not present in df, skip

        # Handle GeoX/GeoY -> point conversion
        has_geoxy = "GeoX" in df.columns and "GeoY" in df.columns
        if has_geoxy and "point" in self.fields:
            exprs.append(
                pl.when(
                    pl.col("GeoX").is_not_null()
                    & (pl.col("GeoX") != "")
                    & pl.col("GeoY").is_not_null()
                    & (pl.col("GeoY") != "")
                )
                .then(
                    pl.concat_str(
                        [pl.lit("POINT("), pl.col("GeoX"), pl.lit(" "), pl.col("GeoY"), pl.lit(")")],
                        separator="",
                    )
                )
                .otherwise(pl.lit(""))
                .alias("point")
            )

        # Add metadata columns
        for mc in meta_cols:
            exprs.append(pl.col(mc))

        # Deduplicate by alias (keep last in case of conflict, e.g. point)
        seen = {}
        for expr in exprs:
            name = expr.meta.output_name()
            seen[name] = expr
        exprs = list(seen.values())

        return df.select(exprs)
diff --git a/digital_land/phase_polars/normalise.py b/digital_land/phase_polars/normalise.py
new file mode 100644
index 00000000..cd03f278
--- /dev/null
+++ b/digital_land/phase_polars/normalise.py
@@ -0,0 +1,84 @@
+import csv
+import os
+import re
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+patch_dir = os.path.join(os.path.dirname(__file__), "../patch")
+
+
class NormalisePhase(PolarsPhase):
    """
    Normalise CSV whitespace, strip null patterns and skip matching rows.

    In the streaming pipeline this operates on raw lines *before* parsing.
    In the Polars pipeline it operates on already-parsed string columns
    which gives equivalent results.
    """

    null_path = os.path.join(patch_dir, "null.csv")

    def __init__(self, skip_patterns=None):
        if skip_patterns is None:
            skip_patterns = []
        self.skip_patterns = [re.compile(p) for p in skip_patterns]

        self.null_patterns = []
        if os.path.exists(self.null_path):
            # Fixed: open the patch file in a `with` block so the handle is
            # closed (it was previously leaked to the garbage collector).
            with open(self.null_path, newline="") as null_file:
                for row in csv.DictReader(null_file):
                    self.null_patterns.append(re.compile(row["pattern"]))

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Strip whitespace, blank null-pattern values, drop blank/skipped rows."""
        if df is None or df.height == 0:
            return df

        # Identify data columns (non-metadata)
        data_cols = [c for c in df.columns if not c.startswith("__")]

        # Strip whitespace from all data columns
        strip_exprs = [
            pl.col(c)
            .str.strip_chars()
            .str.replace_all(r"\r", "")
            .str.replace_all(r"\n", "\r\n")
            .alias(c)
            for c in data_cols
        ]
        if strip_exprs:
            df = df.with_columns(strip_exprs)

        # Apply null patterns to all data columns
        for pattern in self.null_patterns:
            null_exprs = [
                pl.col(c).str.replace_all(pattern.pattern, "").alias(c)
                for c in data_cols
            ]
            if null_exprs:
                df = df.with_columns(null_exprs)

        # Remove completely blank rows (all data columns empty or null)
        if data_cols:
            not_blank = pl.lit(False)
            for c in data_cols:
                not_blank = not_blank | (
                    pl.col(c).is_not_null() & (pl.col(c) != "")
                )
            df = df.filter(not_blank)

        # Skip rows matching skip patterns (matched against full comma-joined line)
        if self.skip_patterns and data_cols:
            concat_expr = pl.concat_str(
                [pl.col(c).fill_null("") for c in data_cols], separator=","
            ).alias("__skip_line")
            df = df.with_columns(concat_expr)

            for pattern in self.skip_patterns:
                df = df.filter(
                    ~pl.col("__skip_line").str.contains(pattern.pattern)
                )

            df = df.drop("__skip_line")

        return df
diff --git a/digital_land/phase_polars/organisation.py b/digital_land/phase_polars/organisation.py
new file mode 100644
index 00000000..1c0f8cce
--- /dev/null
+++ b/digital_land/phase_polars/organisation.py
@@ -0,0 +1,52 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class OrganisationPhase(PolarsPhase):
    """
    Look up the organisation value.
    """

    def __init__(self, organisation=None, issues=None):
        # organisation: lookup object exposing .lookup(value) -> resolved id or "".
        # issues: issue-log sink with resource/line_number/entry_number attributes
        # and a log_issue(field, issue_type, value) method.
        self.organisation = organisation
        self.issues = issues

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Resolve the organisation column via the lookup, logging failures."""
        if df is None or df.height == 0:
            return df

        # Guarantee the column exists so downstream phases can rely on it.
        if "organisation" not in df.columns:
            df = df.with_columns(pl.lit("").alias("organisation"))

        if self.organisation is None:
            return df

        # Apply organisation lookup row-by-row (lookup may be complex)
        def _lookup(val):
            result = self.organisation.lookup(val if val else "")
            return result if result else ""

        df = df.with_columns(
            pl.col("organisation")
            .map_elements(_lookup, return_dtype=pl.Utf8)
            .alias("__org_resolved")
        )

        # Log issues for rows where organisation could not be resolved
        if self.issues:
            for row in df.filter(pl.col("__org_resolved") == "").iter_rows(named=True):
                org_val = row.get("organisation", "")
                # Only a non-empty value that failed to resolve is an issue;
                # blank organisations are left alone.
                if org_val:
                    # Point the issue log at this row's provenance first.
                    self.issues.resource = row.get("__resource", "")
                    self.issues.line_number = row.get("__line_number", 0)
                    self.issues.entry_number = row.get("__entry_number", 0)
                    self.issues.log_issue(
                        "organisation", "invalid organisation", org_val
                    )

        # Replace the original column with the resolved values.
        df = df.with_columns(
            pl.col("__org_resolved").alias("organisation")
        ).drop("__org_resolved")

        return df
diff --git a/digital_land/phase_polars/patch.py b/digital_land/phase_polars/patch.py
new file mode 100644
index 00000000..fa031e36
--- /dev/null
+++ b/digital_land/phase_polars/patch.py
@@ -0,0 +1,87 @@
+import re
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class PatchPhase(PolarsPhase):
    """
    Apply regex patches to field values.

    ``patches`` maps fieldname -> {pattern: replacement}; the "" key holds
    global patches applied to every field.
    """

    def __init__(self, issues=None, patches=None):
        if patches is None:
            patches = {}
        self.issues = issues
        self.patches = patches

    def _apply_patch_value(self, fieldname, value):
        """Apply patch to a single value - mirrors streaming logic exactly."""
        # Field-specific patches merged with global ("") patches.
        patches = {**self.patches.get(fieldname, {}), **self.patches.get("", {})}
        for pattern, replacement in patches.items():
            if pattern == value:
                # Exact-match pattern: anchor and escape to avoid regex surprises.
                pattern = f"^{re.escape(pattern)}$"
            match = re.match(pattern, value, flags=re.IGNORECASE)
            if match:
                newvalue = match.expand(replacement)
                if newvalue != value:
                    if self.issues:
                        self.issues.log_issue(fieldname, "patch", value)
                    return newvalue
        return value

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Patch each applicable column in place using map_elements."""
        if df is None or df.height == 0 or not self.patches:
            return df

        data_cols = [c for c in df.columns if not c.startswith("__")]

        # Determine which fields have patches
        patched_fields = set(self.patches.keys()) - {""}
        global_patches = self.patches.get("", {})

        fields_to_patch = set()
        for col in data_cols:
            if col in patched_fields or global_patches:
                fields_to_patch.add(col)

        if not fields_to_patch:
            return df

        # Use map_elements per field for correctness (regex expand logic is complex)
        for field in fields_to_patch:
            if field not in df.columns:
                continue

            field_patches = {
                **self.patches.get(field, {}),
                **self.patches.get("", {}),
            }
            if not field_patches:
                continue

            def make_patcher(fname, fpatch):
                def _patch(val):
                    if val is None or val == "":
                        return val
                    for pattern, replacement in fpatch.items():
                        p = pattern
                        if p == val:
                            p = f"^{re.escape(p)}$"
                        m = re.match(p, val, flags=re.IGNORECASE)
                        if m:
                            newval = m.expand(replacement)
                            # Fixed: match _apply_patch_value (streaming)
                            # behaviour - only accept the match when it
                            # actually changes the value, and log a "patch"
                            # issue when it does. Previously the inner
                            # patcher returned on any match and never
                            # recorded the issue.
                            if newval != val:
                                if self.issues:
                                    self.issues.log_issue(fname, "patch", val)
                                return newval
                    return val
                return _patch

            patcher = make_patcher(field, field_patches)
            df = df.with_columns(
                pl.col(field)
                .map_elements(patcher, return_dtype=pl.Utf8)
                .alias(field)
            )

        return df
diff --git a/digital_land/phase_polars/phase.py b/digital_land/phase_polars/phase.py
new file mode 100644
index 00000000..d9dbcbcb
--- /dev/null
+++ b/digital_land/phase_polars/phase.py
@@ -0,0 +1,14 @@
+import polars as pl
+
+
class PolarsPhase:
    """
    A step in a Polars-based pipeline process.

    Each phase takes a Polars DataFrame and returns a Polars DataFrame.
    Metadata columns (prefixed with __) carry through the pipeline:
    __resource, __line_number, __entry_number, __path, __dataset, __priority
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Identity transform; subclasses override to do real work."""
        return df
diff --git a/digital_land/phase_polars/pivot.py b/digital_land/phase_polars/pivot.py
new file mode 100644
index 00000000..4c6772c1
--- /dev/null
+++ b/digital_land/phase_polars/pivot.py
@@ -0,0 +1,70 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class PivotPhase(PolarsPhase):
    """
    Unpivot entity rows into a series of facts (one row per field value).
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Expand each entity row into one fact row per non-metadata field."""
        if df is None or df.height == 0:
            return df

        # (Fixed: removed an unused meta_cols local.)
        data_cols = [c for c in df.columns if not c.startswith("__") and c != "entity"]

        if "entity" not in df.columns:
            return df

        # We need to carry metadata and entity through the unpivot.
        # Polars .unpivot() works on value columns.
        # Build the result row-by-row for exact parity with the streaming version.
        rows = []
        for row in df.iter_rows(named=True):
            entity = row.get("entity", "")
            resource = row.get("__resource", "")
            line_number = row.get("__line_number", 0)
            entry_number = row.get("__entry_number", 0)
            priority = row.get("__priority", 1)
            entry_date = row.get("entry-date", "")

            for field in sorted(data_cols):
                value = row.get(field, "") or ""
                rows.append(
                    {
                        "fact": "",
                        "entity": entity,
                        "field": field,
                        "value": value,
                        "priority": str(priority),
                        "resource": resource,
                        "line-number": str(line_number),
                        "entry-number": str(entry_number),
                        "entry-date": entry_date,
                        "__resource": resource,
                        "__line_number": line_number,
                        "__entry_number": entry_number,
                    }
                )

        if not rows:
            # Preserve the fact schema even when there is nothing to emit.
            return pl.DataFrame(
                schema={
                    "fact": pl.Utf8,
                    "entity": pl.Utf8,
                    "field": pl.Utf8,
                    "value": pl.Utf8,
                    "priority": pl.Utf8,
                    "resource": pl.Utf8,
                    "line-number": pl.Utf8,
                    "entry-number": pl.Utf8,
                    "entry-date": pl.Utf8,
                    "__resource": pl.Utf8,
                    "__line_number": pl.Int64,
                    "__entry_number": pl.Int64,
                }
            )

        return pl.DataFrame(rows)
diff --git a/digital_land/phase_polars/prefix.py b/digital_land/phase_polars/prefix.py
new file mode 100644
index 00000000..978337be
--- /dev/null
+++ b/digital_land/phase_polars/prefix.py
@@ -0,0 +1,30 @@
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class EntityPrefixPhase(PolarsPhase):
    """
    Ensure every entry has a prefix field.
    """

    def __init__(self, dataset=None):
        self.dataset = dataset

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fill a missing or blank prefix column with the dataset name."""
        if df is None or df.height == 0:
            return df

        if "prefix" in df.columns:
            blank = pl.col("prefix").is_null() | (pl.col("prefix") == "")
            filled = (
                pl.when(blank)
                .then(pl.lit(self.dataset))
                .otherwise(pl.col("prefix"))
                .alias("prefix")
            )
            return df.with_columns(filled)

        # No prefix column at all: create one from the dataset name.
        return df.with_columns(pl.lit(self.dataset).alias("prefix"))
diff --git a/digital_land/phase_polars/priority.py b/digital_land/phase_polars/priority.py
new file mode 100644
index 00000000..0a3f9af3
--- /dev/null
+++ b/digital_land/phase_polars/priority.py
@@ -0,0 +1,59 @@
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+from digital_land.configuration.main import Config
+
+
class PriorityPhase(PolarsPhase):
    """
    Deduce the priority of each entry when assembling facts.

    With a config, entries whose authoritative organisation is one of the
    providers get priority 2; otherwise the default priority (1) is used
    and the organisation is replaced with the authoritative one.
    """

    def __init__(self, config: Config = None, providers=None):
        if providers is None:
            providers = []
        self.providers = providers
        self.default_priority = 1
        self.config = config
        if not config:
            # Lazy %-style args: only formatted if the record is emitted.
            logging.warning(
                "No config provided so priority defaults to %s",
                self.default_priority,
            )

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Attach a __priority column (and possibly rewrite organisation)."""
        if df is None or df.height == 0:
            return df

        # Without an entity column there is nothing to look up.
        if "entity" not in df.columns:
            df = df.with_columns(pl.lit(self.default_priority).alias("__priority"))
            return df

        if self.config:
            priorities = []
            organisations = []
            for row in df.iter_rows(named=True):
                entity = row.get("entity", "")
                authoritative_org = self.config.get_entity_organisation(entity)
                if authoritative_org is not None:
                    if authoritative_org in self.providers:
                        # Provider-supplied data: lower priority (2), keep
                        # the row's own organisation.
                        priorities.append(2)
                        organisations.append(row.get("organisation", ""))
                    else:
                        # Authoritative data: default priority, use the
                        # authoritative organisation.
                        priorities.append(self.default_priority)
                        organisations.append(authoritative_org)
                else:
                    priorities.append(self.default_priority)
                    organisations.append(row.get("organisation", ""))

            df = df.with_columns(
                pl.Series("__priority", priorities),
                pl.Series("organisation", organisations),
            )
        else:
            df = df.with_columns(
                pl.lit(self.default_priority).alias("__priority")
            )

        return df
diff --git a/digital_land/phase_polars/prune.py b/digital_land/phase_polars/prune.py
new file mode 100644
index 00000000..6d962cbb
--- /dev/null
+++ b/digital_land/phase_polars/prune.py
@@ -0,0 +1,86 @@
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+logger = logging.getLogger(__name__)
+
+
class FieldPrunePhase(PolarsPhase):
    """
    Reduce columns to only those specified for the dataset.
    """

    def __init__(self, fields):
        # Core identity fields are always retained.
        self.fields = list(
            set(fields + ["entity", "organisation", "prefix", "reference"])
        )
        # Fixed: use the module logger with lazy %-args instead of the root
        # logger with an f-string.
        logger.debug("pruning fields to %s", self.fields)

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Keep only configured fields plus __-prefixed metadata columns."""
        if df is None or df.height == 0:
            return df

        meta_cols = [c for c in df.columns if c.startswith("__")]
        keep = [c for c in self.fields if c in df.columns] + meta_cols
        return df.select(keep)
+
+
class EntityPrunePhase(PolarsPhase):
    """
    Remove entries with a missing entity value.
    """

    def __init__(self, issue_log=None, dataset_resource_log=None):
        # NOTE(review): issue_log is accepted for interface parity with the
        # streaming phase but is currently unused here - confirm intentional.
        self.log = dataset_resource_log

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Drop rows with a null/empty entity; record the surviving row count."""
        if df is None or df.height == 0:
            if self.log:
                self.log.entry_count = 0
            return df

        if "entity" not in df.columns:
            if self.log:
                self.log.entry_count = 0
            return df

        # Log skipped rows
        missing = df.filter(
            pl.col("entity").is_null() | (pl.col("entity") == "")
        )
        for row in missing.iter_rows(named=True):
            resource = row.get("__resource", "")
            prefix = row.get("prefix", "")
            reference = row.get("reference", "")
            curie = f"{prefix}:{reference}"
            entry_number = row.get("__entry_number", "")
            logger.info(f"{resource} row {entry_number}: missing entity for {curie}")

        # Keep only rows with a non-empty entity.
        result = df.filter(
            pl.col("entity").is_not_null() & (pl.col("entity") != "")
        )

        if self.log:
            self.log.entry_count = result.height

        return result
+
+
class FactPrunePhase(PolarsPhase):
    """
    Remove facts with a missing value (except when field is end-date).
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Drop rows whose value is null/empty, keeping end-date facts."""
        if df is None or df.height == 0:
            return df

        if "value" not in df.columns:
            return df

        has_value = pl.col("value").is_not_null() & (pl.col("value") != "")
        is_end_date = pl.col("field") == "end-date"
        return df.filter(has_value | is_end_date)
diff --git a/digital_land/phase_polars/reference.py b/digital_land/phase_polars/reference.py
new file mode 100644
index 00000000..6e729d8b
--- /dev/null
+++ b/digital_land/phase_polars/reference.py
@@ -0,0 +1,133 @@
+import re
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+logger = logging.getLogger(__name__)
+
# Fixed: the named groups had lost their names ("(?P[..." is invalid regex
# syntax and would raise re.error at import) - restore (?P<prefix>...) and
# (?P<reference>...) which split_curie reads by name below.
curie_re = re.compile(r"(?P<prefix>[A-Za-z0-9_-]+):(?P<reference>[A-Za-z0-9_-].*)$")


def split_curie(value):
    """Split a CURIE-style value into (prefix, reference).

    Returns ("", value) when the value has no recognisable prefix.
    """
    match = curie_re.match(value)
    if not match:
        return ("", value)
    return (match.group("prefix"), match.group("reference"))
+
+
class EntityReferencePhase(PolarsPhase):
    """
    Ensure each entry has prefix and reference fields derived from the reference column.
    """

    def __init__(self, dataset=None, prefix=None, issues=None):
        self.dataset = dataset
        # Default prefix falls back to the dataset name.
        self.prefix = prefix or dataset
        self.issues = issues

    def _process_row(self, row_dict):
        """Return (prefix, reference) for one row, splitting CURIE values."""
        # Falls back to a column named after the dataset when reference is empty.
        reference_value = row_dict.get("reference", "") or row_dict.get(self.dataset, "") or ""
        ref_prefix, reference = split_curie(reference_value)

        if self.issues and ref_prefix:
            # Point the issue log at this row's provenance before logging.
            self.issues.resource = row_dict.get("__resource", "")
            self.issues.line_number = row_dict.get("__line_number", 0)
            self.issues.entry_number = row_dict.get("__entry_number", 0)
            self.issues.log_issue(
                "reference",
                "reference value contains reference_prefix",
                ref_prefix,
                f"Original reference split into prefix '{ref_prefix}' and reference '{reference}'",
            )

        # Discard UPRN-style prefixes so they don't override the dataset prefix.
        if "UPRN" in ref_prefix:
            ref_prefix = ""

        # Precedence: existing prefix column, then prefix from the CURIE,
        # then the phase default.
        prefix = row_dict.get("prefix", "") or ref_prefix or self.prefix
        return prefix, reference

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Populate/normalise the prefix and reference columns row by row."""
        if df is None or df.height == 0:
            return df

        # Guarantee both columns exist so downstream phases can rely on them.
        if "prefix" not in df.columns:
            df = df.with_columns(pl.lit("").alias("prefix"))
        if "reference" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference"))

        prefixes = []
        references = []
        for row in df.iter_rows(named=True):
            p, r = self._process_row(row)
            prefixes.append(p)
            references.append(r)

        df = df.with_columns(
            pl.Series("prefix", prefixes),
            pl.Series("reference", references),
        )

        return df
+
+
class FactReferencePhase(PolarsPhase):
    """
    Ensure a fact which is a reference has prefix and reference fields.
    """

    def __init__(
        self,
        field_typology_map=None,
        field_prefix_map=None,
    ):
        self.field_typology_map = field_typology_map or {}
        self.field_prefix_map = field_prefix_map or {}

    def _resolve(self, row, ref_typologies):
        """Return (prefix, reference) for one fact row."""
        prefix = row.get("prefix", "") or ""
        reference = row.get("reference", "") or ""

        # Already populated: nothing to derive.
        if prefix and reference:
            return prefix, reference

        field = row.get("field", "")
        if self.field_typology_map.get(field, "") in ref_typologies:
            # Reference-typed fields may carry a CURIE in their value.
            value_prefix, value_reference = split_curie(row.get("value", "") or "")
            prefix = prefix or value_prefix or self.field_prefix_map.get(field, field)
            reference = reference or value_reference

        return prefix, reference

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fill prefix/reference for reference-typed facts; df otherwise unchanged."""
        if df is None or df.height == 0:
            return df

        if "prefix" not in df.columns:
            df = df.with_columns(pl.lit("").alias("prefix"))
        if "reference" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference"))
        if "field" not in df.columns or "value" not in df.columns:
            return df

        ref_typologies = {
            "category", "document", "geography",
            "organisation", "policy", "legal-instrument",
        }

        prefixes, references = [], []
        for row in df.iter_rows(named=True):
            resolved_prefix, resolved_reference = self._resolve(row, ref_typologies)
            prefixes.append(resolved_prefix)
            references.append(resolved_reference)

        return df.with_columns(
            pl.Series("prefix", prefixes),
            pl.Series("reference", references),
        )
diff --git a/digital_land/phase_polars/save.py b/digital_land/phase_polars/save.py
new file mode 100644
index 00000000..f6322c92
--- /dev/null
+++ b/digital_land/phase_polars/save.py
@@ -0,0 +1,45 @@
+import csv
+import logging
+
+import polars as pl
+
+from .phase import PolarsPhase
+
+
class SavePhase(PolarsPhase):
    """
    Save the DataFrame to a CSV file, then pass through.
    """

    def __init__(self, path=None, f=None, fieldnames=None, enabled=True):
        self.path = path
        self.f = f
        self.fieldnames = fieldnames
        self.enabled = enabled

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Write data columns as CSV to the file object or path; return df unchanged."""
        if not self.enabled or df is None or df.height == 0:
            return df

        # Metadata (__-prefixed) columns are never written out.
        data_cols = [c for c in df.columns if not c.startswith("__")]

        if self.fieldnames:
            # Only keep requested fieldnames that exist.
            keep = sorted(f for f in self.fieldnames if f in data_cols)
        else:
            keep = sorted(data_cols)

        if not keep:
            return df

        selected = df.select(keep)

        if self.f:
            # A file object takes precedence over a path.
            self.f.write(selected.write_csv())
        elif self.path:
            selected.write_csv(str(self.path))

        return df
diff --git a/digital_land/phase_polars/transform/__init__.py b/digital_land/phase_polars/transform/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/concat_field.py b/digital_land/phase_polars/transform/concat_field.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/convert.py b/digital_land/phase_polars/transform/convert.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/entity_lookup.py b/digital_land/phase_polars/transform/entity_lookup.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/entity_reference.py b/digital_land/phase_polars/transform/entity_reference.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/fact_hash.py b/digital_land/phase_polars/transform/fact_hash.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/field_prune.py b/digital_land/phase_polars/transform/field_prune.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/filter.py b/digital_land/phase_polars/transform/filter.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/flatten.py b/digital_land/phase_polars/transform/flatten.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/map.py b/digital_land/phase_polars/transform/map.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/migrate.py b/digital_land/phase_polars/transform/migrate.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/normalise.py b/digital_land/phase_polars/transform/normalise.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/parse.py b/digital_land/phase_polars/transform/parse.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/patch.py b/digital_land/phase_polars/transform/patch.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/pivot.py b/digital_land/phase_polars/transform/pivot.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/priority.py b/digital_land/phase_polars/transform/priority.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/resolve_organisation.py b/digital_land/phase_polars/transform/resolve_organisation.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/set_default.py b/digital_land/phase_polars/transform/set_default.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/digital_land/phase_polars/transform/validate.py b/digital_land/phase_polars/transform/validate.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/pyproject.toml b/pyproject.toml
index 19d458a4..dfeea6b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,7 @@ dependencies = [
"boto3",
"moto",
"psutil",
+ "polars",
]
classifiers = [
diff --git a/test_polars_phases.py b/test_polars_phases.py
new file mode 100644
index 00000000..f5ff0bed
--- /dev/null
+++ b/test_polars_phases.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+"""
+Test script for Polars-based pipeline phases.
+
+Creates a simple CSV, runs each polars phase individually and in chain,
+and verifies the output matches expectations.
+"""
+
+import os
+import sys
+import tempfile
+import logging
+
+logging.basicConfig(level=logging.DEBUG, format="%(name)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+# ── Create test CSV ──────────────────────────────────────────────────────────
+TEST_CSV_CONTENT = """\
+reference,name,geometry,documentation-url,start-date,organisation,entry-date
+ref-001,Test Area One,MULTIPOLYGON(((-0.1 51.5,-0.1 51.6,-0.2 51.6,-0.2 51.5,-0.1 51.5))),https://example.com/doc1,2024-01-15,local-authority-eng:example,2024-01-15
+ref-002,Test Area Two,MULTIPOLYGON(((-0.3 51.5,-0.3 51.6,-0.4 51.6,-0.4 51.5,-0.3 51.5))),https://example.com/doc2,2024-02-20,local-authority-eng:example,2024-02-20
+ref-003," Test Area Three ",MULTIPOLYGON(((-0.5 51.5,-0.5 51.6,-0.6 51.6,-0.6 51.5,-0.5 51.5))),https://example.com/doc3,2024-03-10,local-authority-eng:example,2024-03-10
+"""
+
+tmp_dir = tempfile.mkdtemp(prefix="polars_phases_test_")
+input_csv = os.path.join(tmp_dir, "test_input.csv")
+output_csv = os.path.join(tmp_dir, "test_output.csv")
+
+with open(input_csv, "w") as f:
+ f.write(TEST_CSV_CONTENT)
+
+print(f"Test data written to: {input_csv}")
+print(f"Output will go to: {output_csv}")
+
+# ── Import polars phases ─────────────────────────────────────────────────────
+import polars as pl
+
+from digital_land.phase_polars import (
+ run_polars_pipeline,
+ ConvertPhase,
+ NormalisePhase,
+ ConcatFieldPhase,
+ FilterPhase,
+ MapPhase,
+ PatchPhase,
+ HarmonisePhase,
+ DefaultPhase,
+ MigratePhase,
+ OrganisationPhase,
+ FieldPrunePhase,
+ EntityPrunePhase,
+ FactPrunePhase,
+ EntityReferencePhase,
+ EntityPrefixPhase,
+ EntityLookupPhase,
+ FactLookupPhase,
+ SavePhase,
+ PivotPhase,
+ FactCombinePhase,
+ FactorPhase,
+ PriorityPhase,
+ DumpPhase,
+ LoadPhase,
+)
+from digital_land.log import DatasetResourceLog, ConvertedResourceLog, ColumnFieldLog, IssueLog
+
+passed = 0
+failed = 0
+
+
+def check(name, condition, detail=""):
+ global passed, failed
+ if condition:
+ print(f" PASS: {name}")
+ passed += 1
+ else:
+ print(f" FAIL: {name} {detail}")
+ failed += 1
+
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 1: ConvertPhase — loads CSV into DataFrame
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 1: ConvertPhase ──")
+dataset_resource_log = DatasetResourceLog()
+converted_resource_log = ConvertedResourceLog()
+convert_phase = ConvertPhase(
+ path=input_csv,
+ dataset_resource_log=dataset_resource_log,
+ converted_resource_log=converted_resource_log,
+)
+df = convert_phase.process()
+check("returns DataFrame", isinstance(df, pl.DataFrame))
+check("has 3 rows", df.height == 3, f"got {df.height}")
+check("has __resource column", "__resource" in df.columns)
+check("has __line_number column", "__line_number" in df.columns)
+check("has reference column", "reference" in df.columns)
+print(f" Columns: {[c for c in df.columns if not c.startswith('__')]}")
+print(f" Shape: {df.shape}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 2: NormalisePhase — strips whitespace
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 2: NormalisePhase ──")
+normalise_phase = NormalisePhase(skip_patterns=[])
+df2 = normalise_phase.process(df)
+check("preserves row count", df2.height == 3)
+# Check that whitespace was stripped from " Test Area Three "
+names = df2["name"].to_list()
+check("whitespace stripped", "Test Area Three" in names, f"got {names}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 3: MapPhase — renames columns
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 3: MapPhase ──")
+fieldnames = [
+ "reference", "name", "geometry", "documentation-url",
+ "start-date", "organisation", "entry-date", "point",
+ "entity", "prefix", "end-date",
+]
+column_field_log = ColumnFieldLog()
+map_phase = MapPhase(fieldnames=fieldnames, columns={}, log=column_field_log)
+df3 = map_phase.process(df2)
+check("preserves row count", df3.height == 3)
+check("has reference column", "reference" in df3.columns)
+data_cols = [c for c in df3.columns if not c.startswith("__")]
+print(f" Mapped columns: {data_cols}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 4: PatchPhase — applies patches
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 4: PatchPhase ──")
+issue_log = IssueLog(dataset="test-dataset", resource="test-resource")
+patch_phase = PatchPhase(issues=issue_log, patches={})
+df4 = patch_phase.process(df3)
+check("no patches, same rows", df4.height == 3)
+
+# Test with actual patches
+patch_with_data = PatchPhase(
+ issues=issue_log,
+ patches={"name": {"Test Area One": "Patched Area One"}},
+)
+df4b = patch_with_data.process(df3)
+names_patched = df4b["name"].to_list()
+check("patch applied", "Patched Area One" in names_patched, f"got {names_patched}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 5: DefaultPhase — applies defaults
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 5: DefaultPhase ──")
+default_phase = DefaultPhase(
+ issues=issue_log,
+ default_values={"end-date": ""},
+)
+# Add an empty end-date column
+df5_in = df4.with_columns(pl.lit("").alias("end-date"))
+df5 = default_phase.process(df5_in)
+check("preserves rows", df5.height == 3)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 6: FilterPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 6: FilterPhase ──")
+filter_phase = FilterPhase(filters={})
+df6 = filter_phase.process(df5)
+check("no filter, same rows", df6.height == 3)
+
+filter_with_data = FilterPhase(filters={"reference": "ref-001"})
+df6b = filter_with_data.process(df5)
+check("filter applied", df6b.height == 1, f"got {df6b.height}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 7: MigratePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 7: MigratePhase ──")
+migrate_phase = MigratePhase(
+ fields=["reference", "name", "geometry", "documentation-url",
+ "start-date", "organisation", "entry-date", "end-date"],
+ migrations={},
+)
+df7 = migrate_phase.process(df6)
+check("preserves rows", df7.height == 3)
+check("has reference", "reference" in df7.columns)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 8: EntityReferencePhase + EntityPrefixPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 8: EntityReferencePhase + EntityPrefixPhase ──")
+ref_phase = EntityReferencePhase(dataset="test-dataset", prefix="test-dataset", issues=issue_log)
+df8 = ref_phase.process(df7)
+check("has prefix", "prefix" in df8.columns)
+check("has reference", "reference" in df8.columns)
+prefixes = df8["prefix"].to_list()
+check("prefix set", all(p == "test-dataset" for p in prefixes), f"got {prefixes}")
+
+prefix_phase = EntityPrefixPhase(dataset="test-dataset")
+df8b = prefix_phase.process(df8)
+check("prefix still set", all(p == "test-dataset" for p in df8b["prefix"].to_list()))
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 9: FieldPrunePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 9: FieldPrunePhase ──")
+prune_phase = FieldPrunePhase(fields=["reference", "name", "geometry", "organisation"])
+df9 = prune_phase.process(df8b)
+data_cols9 = [c for c in df9.columns if not c.startswith("__")]
+check("pruned to expected fields", len(data_cols9) <= 8, f"got {data_cols9}")
+check("has reference", "reference" in df9.columns)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 10: EntityLookupPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 10: EntityLookupPhase ──")
+from digital_land.phase_polars.lookup import key as lookup_key
+lookups = {
+ lookup_key(prefix="test-dataset", reference="ref-001"): "1000001",
+ lookup_key(prefix="test-dataset", reference="ref-002"): "1000002",
+ lookup_key(prefix="test-dataset", reference="ref-003"): "1000003",
+}
+lookup_phase = EntityLookupPhase(
+ lookups=lookups,
+ redirect_lookups={},
+ issue_log=issue_log,
+ entity_range=[1000000, 2000000],
+)
+df10 = lookup_phase.process(df9)
+check("has entity column", "entity" in df10.columns)
+entities = df10["entity"].to_list()
+check("entities assigned", "1000001" in entities, f"got {entities}")
+check("all entities assigned", all(e for e in entities), f"got {entities}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 11: EntityPrunePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 11: EntityPrunePhase ──")
+dataset_resource_log2 = DatasetResourceLog(dataset="test-dataset", resource="test-resource")
+entity_prune = EntityPrunePhase(dataset_resource_log=dataset_resource_log2)
+df11 = entity_prune.process(df10)
+check("all rows kept (all have entities)", df11.height == 3)
+check("entry count logged", dataset_resource_log2.entry_count == 3)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 12: PriorityPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 12: PriorityPhase ──")
+priority_phase = PriorityPhase(config=None, providers=[])
+df12 = priority_phase.process(df11)
+check("has __priority", "__priority" in df12.columns)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 13: PivotPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 13: PivotPhase ──")
+pivot_phase = PivotPhase()
+df13 = pivot_phase.process(df12)
+check("pivoted to facts", df13.height > 3, f"got {df13.height} rows (should be > 3)")
+check("has fact column", "fact" in df13.columns)
+check("has field column", "field" in df13.columns)
+check("has value column", "value" in df13.columns)
+print(f" Pivoted to {df13.height} fact rows from 3 entity rows")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 14: FactorPhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 14: FactorPhase ──")
+factor_phase = FactorPhase()
+df14 = factor_phase.process(df13)
+facts = df14["fact"].to_list()
+non_empty_facts = [f for f in facts if f]
+check("fact hashes generated", len(non_empty_facts) > 0, f"got {len(non_empty_facts)}")
+check("fact is sha256 hex", len(non_empty_facts[0]) == 64 if non_empty_facts else False)
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 15: FactPrunePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 15: FactPrunePhase ──")
+fact_prune = FactPrunePhase()
+df15 = fact_prune.process(df14)
+check("facts pruned (empty values removed)", df15.height <= df14.height)
+print(f"  Before: {df14.height} → After: {df15.height}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 16: SavePhase
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 16: SavePhase ──")
+save_phase = SavePhase(path=output_csv, fieldnames=["entity", "fact", "field", "value"])
+df16 = save_phase.process(df15)
+check("CSV file created", os.path.exists(output_csv))
+if os.path.exists(output_csv):
+ import csv
+ with open(output_csv) as f:
+ reader = csv.DictReader(f)
+ rows = list(reader)
+ check("CSV has rows", len(rows) > 0, f"got {len(rows)}")
+ check("CSV has entity column", "entity" in rows[0])
+ print(f" Saved {len(rows)} rows to {output_csv}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# TEST 17: run_polars_pipeline (chained execution)
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n── Test 17: run_polars_pipeline (chained) ──")
+chain_output = os.path.join(tmp_dir, "chain_output.csv")
+result_df = run_polars_pipeline(
+ ConvertPhase(path=input_csv),
+ NormalisePhase(),
+ MapPhase(fieldnames=fieldnames, columns={}),
+ FilterPhase(filters={}),
+ SavePhase(path=chain_output, enabled=True),
+)
+check("chain returns DataFrame", isinstance(result_df, pl.DataFrame))
+check("chain output file exists", os.path.exists(chain_output))
+if os.path.exists(chain_output):
+ result_check = pl.read_csv(chain_output)
+ check("chain output has 3 rows", result_check.height == 3, f"got {result_check.height}")
+
+# ───────────────────────────────────────────────────────────────────────────────
+# SUMMARY
+# ───────────────────────────────────────────────────────────────────────────────
+print("\n" + "=" * 70)
+print(f"RESULTS: {passed} passed, {failed} failed out of {passed + failed} checks")
+print("=" * 70)
+
+if failed > 0:
+ print("\nSome tests FAILED!")
+ sys.exit(1)
+else:
+ print("\nAll tests PASSED!")
+ sys.exit(0)