diff --git a/.gitignore b/.gitignore index a08611ff5..578c0446b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ demodata/ .eggs *.gfs .venv +/venv .direnv var/cache /collection @@ -35,4 +36,16 @@ docs/modules.rst # don't store data folder for use as storage for notebooks notebooks/data/ -notebooks/.ipynb_checkpoints \ No newline at end of file +notebooks/.ipynb_checkpoints + +# local_testing +/local_testing/cache/ +/local_testing/converted/ +/local_testing/extracted/ +/local_testing/output/ +/local_testing/polars_phases/ +/local_testing/raw/ +/local_testing/reports/ +/local_testing/specification/ +/local_testing/venv/ + diff --git a/digital_land/commands.py b/digital_land/commands.py index 463f0f452..67bb4c140 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -59,6 +59,32 @@ from digital_land.phase.save import SavePhase from digital_land.pipeline import run_pipeline, Lookups, Pipeline from digital_land.pipeline.process import convert_tranformed_csv_to_pq +from digital_land.phase_polars import run_polars_pipeline +from digital_land.phase_polars import ( + ConvertPhase as PolarsConvertPhase, + NormalisePhase as PolarsNormalisePhase, + ConcatFieldPhase as PolarsConcatFieldPhase, + FilterPhase as PolarsFilterPhase, + MapPhase as PolarsMapPhase, + PatchPhase as PolarsPatchPhase, + HarmonisePhase as PolarsHarmonisePhase, + DefaultPhase as PolarsDefaultPhase, + MigratePhase as PolarsMigratePhase, + OrganisationPhase as PolarsOrganisationPhase, + FieldPrunePhase as PolarsFieldPrunePhase, + EntityPrunePhase as PolarsEntityPrunePhase, + FactPrunePhase as PolarsFactPrunePhase, + EntityReferencePhase as PolarsEntityReferencePhase, + FactReferencePhase as PolarsFactReferencePhase, + EntityPrefixPhase as PolarsEntityPrefixPhase, + EntityLookupPhase as PolarsEntityLookupPhase, + FactLookupPhase as PolarsFactLookupPhase, + SavePhase as PolarsSavePhase, + PivotPhase as PolarsPivotPhase, + FactCombinePhase as PolarsFactCombinePhase, + FactorPhase as 
PolarsFactorPhase, + PriorityPhase as PolarsPriorityPhase, +) from digital_land.schema import Schema from digital_land.update import add_source_endpoint from digital_land.configuration.main import Config @@ -237,6 +263,7 @@ def pipeline_run( resource=None, output_log_dir=None, converted_path=None, + use_polars=False, ): # set up paths cache_dir = Path(cache_dir) @@ -302,87 +329,168 @@ def pipeline_run( if "entry-date" not in default_values: default_values["entry-date"] = entry_date - # TODO Migrate all of this into a function in the Pipeline function - run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - converted_resource_log=converted_resource_log, - output_path=converted_path, - ), - NormalisePhase(skip_patterns=skip_patterns), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - FilterPhase(filters=pipeline.filters(resource)), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - dataset=dataset, - valid_category_values=valid_category_values, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - issues=issue_log, - ), - 
EntityPrefixPhase(dataset=dataset), - EntityLookupPhase( - lookups=lookups, - redirect_lookups=redirect_lookups, - issue_log=issue_log, - operational_issue_log=operational_issue_log, - entity_range=[entity_range_min, entity_range_max], - ), - SavePhase( - default_output_path("harmonised", input_path), - fieldnames=intermediate_fieldnames, - enabled=save_harmonised, - ), - EntityPrunePhase(dataset_resource_log=dataset_resource_log), - PriorityPhase(config=config, providers=organisations), - PivotPhase(), - FactCombinePhase(issue_log=issue_log, fields=combine_fields), - FactorPhase(), - FactReferencePhase( - field_typology_map=specification.get_field_typology_map(), - field_prefix_map=specification.get_field_prefix_map(), - ), - FactLookupPhase( - lookups=lookups, - redirect_lookups=redirect_lookups, - issue_log=issue_log, - odp_collections=specification.get_odp_collections(), - ), - FactPrunePhase(), - SavePhase( - output_path, - fieldnames=specification.factor_fieldnames(), - ), - ) + if use_polars: + # ── Polars-based pipeline ────────────────────────────────────────── + run_polars_pipeline( + PolarsConvertPhase( + path=input_path, + dataset_resource_log=dataset_resource_log, + converted_resource_log=converted_resource_log, + output_path=converted_path, + ), + PolarsNormalisePhase(skip_patterns=skip_patterns), + # ParsePhase is not needed – ConvertPhase already produces a DataFrame + PolarsConcatFieldPhase(concats=concats, log=column_field_log), + PolarsFilterPhase(filters=pipeline.filters(resource)), + PolarsMapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + PolarsFilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)), + PolarsPatchPhase( + issues=issue_log, + patches=patches, + ), + PolarsHarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + dataset=dataset, + valid_category_values=valid_category_values, + ), + PolarsDefaultPhase( + 
default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + PolarsMigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + PolarsOrganisationPhase(organisation=organisation, issues=issue_log), + PolarsFieldPrunePhase(fields=specification.current_fieldnames(schema)), + PolarsEntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + issues=issue_log, + ), + PolarsEntityPrefixPhase(dataset=dataset), + PolarsEntityLookupPhase( + lookups=lookups, + redirect_lookups=redirect_lookups, + issue_log=issue_log, + operational_issue_log=operational_issue_log, + entity_range=[entity_range_min, entity_range_max], + ), + PolarsSavePhase( + default_output_path("harmonised", input_path), + fieldnames=intermediate_fieldnames, + enabled=save_harmonised, + ), + PolarsEntityPrunePhase(dataset_resource_log=dataset_resource_log), + PolarsPriorityPhase(config=config, providers=organisations), + PolarsPivotPhase(), + PolarsFactCombinePhase(issue_log=issue_log, fields=combine_fields), + PolarsFactorPhase(), + PolarsFactReferencePhase( + field_typology_map=specification.get_field_typology_map(), + field_prefix_map=specification.get_field_prefix_map(), + ), + PolarsFactLookupPhase( + lookups=lookups, + redirect_lookups=redirect_lookups, + issue_log=issue_log, + odp_collections=specification.get_odp_collections(), + ), + PolarsFactPrunePhase(), + PolarsSavePhase( + output_path, + fieldnames=specification.factor_fieldnames(), + ), + ) + else: + # ── Original streaming pipeline ──────────────────────────────────── + # TODO Migrate all of this into a function in the Pipeline function + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=dataset_resource_log, + converted_resource_log=converted_resource_log, + output_path=converted_path, + ), + NormalisePhase(skip_patterns=skip_patterns), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + 
FilterPhase(filters=pipeline.filters(resource)), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + dataset=dataset, + valid_category_values=valid_category_values, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + issues=issue_log, + ), + EntityPrefixPhase(dataset=dataset), + EntityLookupPhase( + lookups=lookups, + redirect_lookups=redirect_lookups, + issue_log=issue_log, + operational_issue_log=operational_issue_log, + entity_range=[entity_range_min, entity_range_max], + ), + SavePhase( + default_output_path("harmonised", input_path), + fieldnames=intermediate_fieldnames, + enabled=save_harmonised, + ), + EntityPrunePhase(dataset_resource_log=dataset_resource_log), + PriorityPhase(config=config, providers=organisations), + PivotPhase(), + FactCombinePhase(issue_log=issue_log, fields=combine_fields), + FactorPhase(), + FactReferencePhase( + field_typology_map=specification.get_field_typology_map(), + field_prefix_map=specification.get_field_prefix_map(), + ), + FactLookupPhase( + lookups=lookups, + redirect_lookups=redirect_lookups, + issue_log=issue_log, + 
odp_collections=specification.get_odp_collections(), + ), + FactPrunePhase(), + SavePhase( + output_path, + fieldnames=specification.factor_fieldnames(), + ), + ) # In the FactCombinePhase, when combine_fields has some values, we check for duplicates and combine values. # If we have done this then we will not call duplicate_reference_check as we have already carried out a diff --git a/digital_land/phase_polars/README.md b/digital_land/phase_polars/README.md deleted file mode 100644 index 853f2fbf8..000000000 --- a/digital_land/phase_polars/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# Phases - -This directory contains transformation phases used in the digital-land data pipeline. Phases are modular processing steps that transform and validate data. - -## Transform Phases - -The `transform` folder contains the core data transformation phases executed in sequence: - -### Data Transformation Pipeline - -1. **01_convert.py** - Convert data types and formats -2. **02_normalise.py** - Normalize data values and structure -3. **03_parse.py** - Parse and extract data from raw inputs -4. **04_concat_field.py** - Concatenate multiple fields -5. **05_filter.py** - Filter records based on criteria -6. **06_map.py** - Map values between different formats -7. **07_patch.py** - Apply patches to data records -8. **08_validate.py** - Validate data against schema -9. **09_set_default.py** - Set default values for missing data -10. **10_migrate.py** - Migrate data structure/format -11. **11_resolve_organisation.py** - Resolve and enrich organisation references -12. **12_field_prune.py** - Remove unnecessary fields -13. **13_entity_reference.py** - Handle entity references -14. **14_entity_lookup.py** - Lookup and enrich entity data -15. **15_pivot.py** - Pivot data structure -16. **16_fact_hash.py** - Generate fact hashes for deduplication -17. **17_flatten.py** - Flatten nested data structures - -## Load Phases - -The `load` folder contains phases for saving and storing data: - -1. 
**01_save_file.py** - Save data to file storage -2. **02_save_database.py** - Save data to database - -## Overview - -Each phase is designed to be: -- **Modular** - Can be used independently or in sequence -- **Configurable** - Parameters can be customized via configuration -- **Reusable** - Shared across different pipelines and workflows diff --git a/digital_land/phase_polars/__init__.py b/digital_land/phase_polars/__init__.py index e69de29bb..50a7e1d56 100644 --- a/digital_land/phase_polars/__init__.py +++ b/digital_land/phase_polars/__init__.py @@ -0,0 +1,87 @@ +""" +Polars-based pipeline phases. + +Drop-in replacements for the streaming phases in `digital_land.phase`. +Each phase accepts and returns a `polars.DataFrame` instead of a generator. +""" + +import logging + +import polars as pl + +from .phase import PolarsPhase +from .convert import ConvertPhase +from .normalise import NormalisePhase +from .concat import ConcatFieldPhase +from .filter import FilterPhase +from .map import MapPhase +from .patch import PatchPhase +from .harmonise import HarmonisePhase +from .default import DefaultPhase +from .migrate import MigratePhase +from .organisation import OrganisationPhase +from .prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase +from .reference import EntityReferencePhase, FactReferencePhase +from .prefix import EntityPrefixPhase +from .lookup import EntityLookupPhase, FactLookupPhase, PrintLookupPhase +from .save import SavePhase +from .pivot import PivotPhase +from .combine import FactCombinePhase +from .factor import FactorPhase +from .priority import PriorityPhase +from .dump import DumpPhase +from .load import LoadPhase + +logger = logging.getLogger(__name__) + + +def run_polars_pipeline(*phases): + """ + Run a sequence of Polars phases. + + Each phase receives the DataFrame output of the previous phase. + The first phase typically starts from ``df=None`` and creates + the initial DataFrame (e.g. ConvertPhase). 
+ """ + df = None + for phase in phases: + logger.debug(f"running polars phase {phase.__class__.__name__}") + df = phase.process(df) + if df is not None: + logger.debug( + f" -> {phase.__class__.__name__} produced {df.height} rows, " + f"{len([c for c in df.columns if not c.startswith('__')])} data cols" + ) + return df + + +__all__ = [ + "PolarsPhase", + "ConvertPhase", + "NormalisePhase", + "ConcatFieldPhase", + "FilterPhase", + "MapPhase", + "PatchPhase", + "HarmonisePhase", + "DefaultPhase", + "MigratePhase", + "OrganisationPhase", + "FieldPrunePhase", + "EntityPrunePhase", + "FactPrunePhase", + "EntityReferencePhase", + "FactReferencePhase", + "EntityPrefixPhase", + "EntityLookupPhase", + "FactLookupPhase", + "PrintLookupPhase", + "SavePhase", + "PivotPhase", + "FactCombinePhase", + "FactorPhase", + "PriorityPhase", + "DumpPhase", + "LoadPhase", + "run_polars_pipeline", +] diff --git a/digital_land/phase_polars/combine.py b/digital_land/phase_polars/combine.py new file mode 100644 index 000000000..db4d17da6 --- /dev/null +++ b/digital_land/phase_polars/combine.py @@ -0,0 +1,87 @@ +from copy import deepcopy + +import polars as pl + +from .phase import PolarsPhase + +try: + from shapely.ops import unary_union + from shapely.geometry import MultiPolygon + import shapely.wkt + from digital_land.datatype.wkt import dump_wkt + + HAS_SHAPELY = True +except ImportError: + HAS_SHAPELY = False + + +def combine_geometries(wkts, precision=6): + geometries = [shapely.wkt.loads(x) for x in wkts] + union = unary_union(geometries) + if not isinstance(union, MultiPolygon): + union = MultiPolygon([union]) + return dump_wkt(union, precision=precision) + + +class FactCombinePhase(PolarsPhase): + """ + Combine field values from multiple facts for the same entity. 
+ """ + + def __init__(self, issue_log=None, fields=None): + if fields is None: + fields = {} + self.issues = issue_log + self.fields = fields + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0 or not self.fields: + return df + + if "field" not in df.columns or "entity" not in df.columns: + return df + + combine_field_names = set(self.fields.keys()) if isinstance(self.fields, dict) else set(self.fields) + + # Split into combinable and non-combinable + mask = pl.col("field").is_in(list(combine_field_names)) + pass_through = df.filter(~mask) + to_combine = df.filter(mask) + + if to_combine.height == 0: + return pass_through + + # Group by entity + field and combine values + combined_rows = [] + for (entity, field), group_df in to_combine.group_by(["entity", "field"]): + values = [ + v + for v in group_df["value"].to_list() + if v is not None and v != "" + ] + values = sorted(set(values)) + + if field == "geometry" and HAS_SHAPELY and values: + combined_value = combine_geometries(values) + elif isinstance(self.fields, dict) and field in self.fields: + separator = self.fields[field] + combined_value = separator.join(values) + else: + combined_value = ";".join(values) + + # Emit rows for each original row in the group + for row in group_df.iter_rows(named=True): + if self.issues: + self.issues.line_number = row.get("line-number", row.get("__line_number", "")) + self.issues.entry_number = row.get("entry-number", row.get("__entry_number", "")) + self.issues.log_issue(field, "combined-value", entity) + + new_row = dict(row) + new_row["value"] = combined_value + combined_rows.append(new_row) + + if combined_rows: + combined_df = pl.DataFrame(combined_rows, schema=df.schema) + return pl.concat([pass_through, combined_df]) + + return pass_through diff --git a/digital_land/phase_polars/concat.py b/digital_land/phase_polars/concat.py new file mode 100644 index 000000000..111c2bd9d --- /dev/null +++ b/digital_land/phase_polars/concat.py @@ 
-0,0 +1,90 @@ +import itertools + +import polars as pl + +from .phase import PolarsPhase + + +class ConcatFieldPhase(PolarsPhase): + """ + Concatenate multiple source fields into a single destination field. + """ + + def __init__(self, concats=None, log=None): + if concats is None: + concats = {} + self.concats = concats + + if log: + for fieldname, cat in self.concats.items(): + log.add( + fieldname, + cat["prepend"] + + cat["separator"].join(cat["fields"]) + + cat["append"], + ) + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0 or not self.concats: + return df + + for fieldname, cat in self.concats.items(): + prepend = cat["prepend"] + separator = cat["separator"] + append = cat["append"] + source_fields = cat["fields"] + + # Ensure the destination column exists + if fieldname not in df.columns: + df = df.with_columns(pl.lit("").alias(fieldname)) + + # Build list of expressions for values to concatenate + # Start with the existing field value, then add source fields + parts = [pl.col(fieldname).fill_null("")] + for h in source_fields: + if h in df.columns: + parts.append( + pl.when( + pl.col(h).is_not_null() + & (pl.col(h).str.strip_chars() != "") + ) + .then(pl.col(h)) + .otherwise(pl.lit(None)) + ) + + # Filter out nulls and join with separator, then wrap with prepend/append + def _concat_row(row_vals): + filtered = [v for v in row_vals if v is not None and v != ""] + body = separator.join(filtered) + return prepend + body + append + + # Use struct + map_elements for the concatenation logic + struct_cols = [] + temp_names = [] + for i, part in enumerate(parts): + name = f"__concat_part_{i}" + temp_names.append(name) + struct_cols.append(part.alias(name)) + + df = df.with_columns(struct_cols) + + df = df.with_columns( + pl.struct(temp_names) + .map_elements( + lambda s, sep=separator, pre=prepend, app=append: ( + pre + + sep.join( + v + for v in s.values() + if v is not None and str(v).strip() != "" + ) + + app + ), + 
return_dtype=pl.Utf8, + ) + .alias(fieldname) + ) + + df = df.drop(temp_names) + + return df diff --git a/digital_land/phase_polars/convert.py b/digital_land/phase_polars/convert.py new file mode 100644 index 000000000..35380defb --- /dev/null +++ b/digital_land/phase_polars/convert.py @@ -0,0 +1,226 @@ +import csv +import logging +import os +import tempfile +import time +from pathlib import Path + +import polars as pl + +from .phase import PolarsPhase +from ..phase.convert import ( + ConversionError, + convert_features_to_csv, + convert_json_to_csv, + detect_file_encoding, + read_csv, + read_excel, +) +from ..log import ConvertedResourceLog + +import sqlite3 +import zipfile + +logger = logging.getLogger(__name__) + + +class ConvertPhase(PolarsPhase): + """ + Detect and convert input file format then load into a Polars DataFrame. + + Re-uses the existing format-detection and conversion helpers so the + behaviour is identical to the streaming ConvertPhase. + """ + + def __init__( + self, + path=None, + dataset_resource_log=None, + converted_resource_log=None, + output_path=None, + ): + self.path = path + self.dataset_resource_log = dataset_resource_log + self.converted_resource_log = converted_resource_log + self.charset = "" + self.output_path = output_path + if output_path: + output_dir = os.path.dirname(str(output_path)) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir) + + def _resource_from_path(self, path): + return Path(path).stem + + def _find_zip_file(self, input_file, suffix=".gml"): + zip_ = zipfile.ZipFile(input_file) + files = zip_.namelist() + files = list( + set( + filter( + lambda s: s.endswith(suffix) or s.endswith(suffix.upper()), files + ) + ) + ) + if not files or not len(files): + return None + if len(files) > 1: + raise ValueError("Zipfile contains more than one %s file" % suffix) + return "/" + files[0] + + def find_internal_path(self, input_path): + for suffix, mime in [ + (".shp", "x-gis/x-shapefile"), + (".gml", 
"application/gml+xml"), + (".tab", "x-gis/x-mapinfo-tab"), + (".geojson", "application/vnd.geo+json"), + (".json", "application/vnd.geo+json"), + (".kml", "application/vnd.google-earth.kml+xml"), + ]: + internal_path = self._find_zip_file(input_path, suffix) + if internal_path: + return internal_path, mime + return None, None + + def _get_csv_path(self, input_path): + """Return (csv_path, should_delete_temp) by converting the input to CSV if needed.""" + + # Try binary formats first + excel = read_excel(input_path) + if excel is not None: + logger.debug(f"{input_path} looks like excel") + if self.dataset_resource_log: + self.dataset_resource_log.mime_type = "application/vnd.ms-excel" + tmp = self.output_path or tempfile.NamedTemporaryFile( + suffix=".csv", delete=False + ).name + excel.to_csv( + str(tmp), index=False, header=True, encoding="utf-8", quoting=csv.QUOTE_ALL + ) + return str(tmp), False + + if zipfile.is_zipfile(input_path): + logger.debug(f"{input_path} looks like zip") + if self.dataset_resource_log: + self.dataset_resource_log.mime_type = "application/zip" + internal_path, mime_type = self.find_internal_path(input_path) + if internal_path: + if self.dataset_resource_log: + self.dataset_resource_log.internal_path = internal_path + self.dataset_resource_log.internal_mime_type = mime_type + parent = str(self.output_path.parent) if self.output_path else None + tmp = tempfile.NamedTemporaryFile(suffix=".zip", dir=parent).name + os.link(input_path, tmp) + zip_path = f"/vsizip/{tmp}{internal_path}" + csv_path = convert_features_to_csv(zip_path, self.output_path) + return csv_path, False + + try: + conn = sqlite3.connect(input_path) + cursor = conn.cursor() + cursor.execute("pragma quick_check") + conn.close() + logger.debug(f"{input_path} looks like SQLite") + if self.dataset_resource_log: + self.dataset_resource_log.mime_type = "application/geopackage+sqlite3" + csv_path = convert_features_to_csv(input_path, self.output_path) + return csv_path, False + 
except Exception: + pass + + # Text-based formats + encoding = detect_file_encoding(input_path) + if not encoding: + raise ConversionError(f"Cannot detect encoding for {input_path}") + + self.charset = ";charset=" + encoding + with open(input_path, encoding=encoding) as f: + content = f.read(10) + + if content.lower().startswith(" pl.DataFrame: + if df is None or df.height == 0: + return df + + # Apply default_fields: if field is empty, copy from another field + for field, default_field in self.default_fields.items(): + if default_field not in df.columns: + continue + if field not in df.columns: + df = df.with_columns(pl.lit("").alias(field)) + + df = df.with_columns( + pl.when( + pl.col(field).is_null() + | (pl.col(field) == "") + ) + .then( + pl.when( + pl.col(default_field).is_not_null() + & (pl.col(default_field) != "") + ) + .then(pl.col(default_field)) + .otherwise(pl.col(field)) + ) + .otherwise(pl.col(field)) + .alias(field) + ) + + # Apply default_values: if field is empty, use a fixed default value + for field, value in self.default_values.items(): + if not value: + continue + + if field not in df.columns: + df = df.with_columns(pl.lit("").alias(field)) + + df = df.with_columns( + pl.when( + pl.col(field).is_null() + | (pl.col(field) == "") + ) + .then(pl.lit(value)) + .otherwise(pl.col(field)) + .alias(field) + ) + + return df diff --git a/digital_land/phase_polars/dump.py b/digital_land/phase_polars/dump.py new file mode 100644 index 000000000..cd4ff3ce5 --- /dev/null +++ b/digital_land/phase_polars/dump.py @@ -0,0 +1,29 @@ +import polars as pl + +from .phase import PolarsPhase + + +class DumpPhase(PolarsPhase): + """ + Dump raw data to a CSV file (for the ConvertPhase output). 
+ """ + + def __init__(self, path=None, f=None, enabled=True): + self.path = path + self.f = f + self.enabled = enabled + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if not self.enabled or df is None or df.height == 0: + return df + + data_cols = [c for c in df.columns if not c.startswith("__")] + out_df = df.select(data_cols) + + if self.f: + csv_str = out_df.write_csv() + self.f.write(csv_str) + elif self.path: + out_df.write_csv(str(self.path)) + + return df diff --git a/digital_land/phase_polars/factor.py b/digital_land/phase_polars/factor.py new file mode 100644 index 000000000..7ef532953 --- /dev/null +++ b/digital_land/phase_polars/factor.py @@ -0,0 +1,40 @@ +import hashlib + +import polars as pl + +from .phase import PolarsPhase + + +def fact_hash(entity, field, value): + data = entity + ":" + field + ":" + value + return hashlib.sha256(data.encode("utf-8")).hexdigest() + + +class FactorPhase(PolarsPhase): + """ + Add a fact hash identifier for each fact row. + """ + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + if not all(c in df.columns for c in ["entity", "field", "value"]): + return df + + df = df.with_columns( + pl.struct(["entity", "field", "value"]) + .map_elements( + lambda s: fact_hash( + str(s["entity"] or ""), + str(s["field"] or ""), + str(s["value"] or ""), + ) + if s["entity"] + else "", + return_dtype=pl.Utf8, + ) + .alias("fact") + ) + + return df diff --git a/digital_land/phase_polars/filter.py b/digital_land/phase_polars/filter.py new file mode 100644 index 000000000..4eac43583 --- /dev/null +++ b/digital_land/phase_polars/filter.py @@ -0,0 +1,32 @@ +import re + +import polars as pl + +from .phase import PolarsPhase + + +class FilterPhase(PolarsPhase): + """ + Filter rows based on regex patterns applied to field values. + Only rows where *all* filter patterns match are kept. 
+ """ + + def __init__(self, filters=None): + if filters is None: + filters = {} + self.filters = {} + for field, pattern in filters.items(): + self.filters[field] = re.compile(pattern) + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0 or not self.filters: + return df + + mask = pl.lit(True) + for field, pattern in self.filters.items(): + if field in df.columns: + mask = mask & pl.col(field).fill_null("").str.contains( + f"^(?:{pattern.pattern})" + ) + + return df.filter(mask) diff --git a/digital_land/phase_polars/harmonise.py b/digital_land/phase_polars/harmonise.py new file mode 100644 index 000000000..204f7e4a3 --- /dev/null +++ b/digital_land/phase_polars/harmonise.py @@ -0,0 +1,229 @@ +import logging +from datetime import datetime, date +from calendar import monthrange + +import polars as pl + +from .phase import PolarsPhase +from digital_land.datatype.point import PointDataType +from digital_land.datatype.factory import datatype_factory + +try: + import shapely.wkt +except ImportError: + shapely = None + +logger = logging.getLogger(__name__) + +MANDATORY_FIELDS_DICT = { + "article-4-direction": [ + "reference", "name", "document-url", "documentation-url", + ], + "article-4-direction-area": [ + "reference", "geometry", "name", "permitted-development-rights", + ], + "conservation-area": ["reference", "geometry", "name"], + "conservation-area-document": [ + "reference", "name", "conservation-area", + "document-url", "documentation-url", "document-type", + ], + "tree-preservation-order": [ + "reference", "document-url", "documentation-url", + ], + "tree-preservation-zone": ["reference", "geometry"], + "listed-building-outline": ["reference", "geometry", "name", "listed-building"], + "tree": ["reference", "point", "geometry"], + "brownfield-land": [ + "OrganisationURI", "SiteReference", "SiteNameAddress", "GeoX", "GeoY", + ], +} + +FAR_FUTURE_YEARS_AHEAD = 50 + + +class HarmonisePhase(PolarsPhase): + """ + Harmonise field 
values according to their datatype specification. + + This phase delegates to the existing datatype normalisation logic on a + per-row basis using map_elements for correctness, since individual + datatype classes contain complex transformation rules. + """ + + def __init__( + self, + field_datatype_map=None, + issues=None, + dataset=None, + valid_category_values=None, + ): + if field_datatype_map is None: + field_datatype_map = {} + if valid_category_values is None: + valid_category_values = {} + self.field_datatype_map = field_datatype_map + self.issues = issues + self.dataset = dataset + self.valid_category_values = valid_category_values + + def _get_far_future_date(self, number_of_years_ahead: int): + today = date.today() + y = today.year + number_of_years_ahead + last_day = monthrange(y, today.month)[1] + day = min(today.day, last_day) + return today.replace(year=y, day=day) + + def _harmonise_row(self, row_dict, resource, line_number, entry_number): + """Harmonise a single row – mirrors the streaming HarmonisePhase exactly.""" + if self.issues: + self.issues.resource = resource + self.issues.line_number = line_number + self.issues.entry_number = entry_number + + o = {} + for field, value in row_dict.items(): + if field.startswith("__"): + continue + + # Category value validation + if field in self.valid_category_values: + if value: + normalised_value = value.replace(" ", "-") + matching_value = next( + ( + v + for v in self.valid_category_values[field] + if v.lower() == normalised_value.lower() + ), + None, + ) + if matching_value: + value = matching_value + else: + if self.issues: + self.issues.log_issue( + field, "invalid category value", value + ) + + # Harmonise via datatype + if not value: + o[field] = "" + elif field in self.field_datatype_map: + if self.issues: + self.issues.fieldname = field + datatype_name = self.field_datatype_map[field] + if datatype_name == "datetime": + far_past_date = date(1799, 12, 31) + far_future_date = 
self._get_far_future_date(FAR_FUTURE_YEARS_AHEAD) + datatype = datatype_factory( + datatype_name=datatype_name, + far_past_date=far_past_date, + far_future_date=far_future_date, + ) + else: + datatype = datatype_factory(datatype_name=datatype_name) + o[field] = datatype.normalise(value, issues=self.issues) + else: + o[field] = value + + # Future entry-date check + for field in ["entry-date", "LastUpdatedDate"]: + val = o.get(field, "") + if val: + try: + if datetime.strptime(val[:10], "%Y-%m-%d").date() > datetime.today().date(): + if self.issues: + self.issues.log_issue( + field, "future entry-date", row_dict.get(field, ""), + f"{field} must be today or in the past", + ) + o[field] = "" + except (ValueError, TypeError): + pass + + # GeoX/GeoY handling + if "GeoX" in row_dict and "GeoY" in row_dict: + if self.issues: + self.issues.fieldname = "GeoX,GeoY" + point = PointDataType() + try: + geometry = point.normalise( + [o.get("GeoX", ""), o.get("GeoY", "")], + issues=self.issues, + ) + if geometry and shapely: + point_geometry = shapely.wkt.loads(geometry) + x, y = point_geometry.coords[0] + o["GeoX"] = str(x) + o["GeoY"] = str(y) + elif not geometry: + o.pop("GeoX", None) + o.pop("GeoY", None) + except Exception as e: + logger.error( + f"Exception occurred while fetching geoX, geoY coordinates: {e}" + ) + + # Typology prefix + for typology in ["organisation", "geography", "document"]: + value = o.get(typology, "") + if value and ":" not in value: + o[typology] = f"{self.dataset}:{value}" + + # Mandatory field checks + mandatory_fields = MANDATORY_FIELDS_DICT.get(self.dataset) + for field in row_dict: + if field.startswith("__"): + continue + if field in ["geometry", "point"]: + if not row_dict.get("geometry") and not row_dict.get("point"): + if self.issues: + self.issues.log_issue( + field, "missing value", "", f"{field} missing" + ) + elif mandatory_fields and field in mandatory_fields: + if not row_dict.get(field): + if self.issues: + self.issues.log_issue( + 
field, "missing value", "", f"{field} missing" + ) + + # Wikipedia + if row_dict.get("wikipedia", "").startswith("http"): + if self.issues: + self.issues.log_issue( + "wikipedia", "removed URI prefix", row_dict["wikipedia"] + ) + o["wikipedia"] = row_dict["wikipedia"].replace( + "https://en.wikipedia.org/wiki/", "" + ) + + return o + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + meta_cols = [c for c in df.columns if c.startswith("__")] + data_cols = [c for c in df.columns if not c.startswith("__")] + + results = [] + for row in df.iter_rows(named=True): + resource = row.get("__resource", "") + line_number = row.get("__line_number", 0) + entry_number = row.get("__entry_number", 0) + + harmonised = self._harmonise_row(row, resource, line_number, entry_number) + + # Include metadata + out = {} + for mc in meta_cols: + out[mc] = row[mc] + for field in data_cols: + out[field] = harmonised.get(field, "") + results.append(out) + + if not results: + return df.clear() + + return pl.DataFrame(results, schema={c: pl.Utf8 for c in results[0] if not c.startswith("__")} | {c: df.schema[c] for c in meta_cols if c in df.schema}) diff --git a/digital_land/phase_polars/load.py b/digital_land/phase_polars/load.py new file mode 100644 index 000000000..b0571bc01 --- /dev/null +++ b/digital_land/phase_polars/load.py @@ -0,0 +1,39 @@ +import polars as pl + +from .phase import PolarsPhase + + +class LoadPhase(PolarsPhase): + """ + Load a CSV file into a Polars DataFrame. 
+ """ + + def __init__(self, path=None, resource=None, dataset=None): + self.path = path + self.resource = resource + self.dataset = dataset + + def process(self, df=None): + from pathlib import Path + + path = self.path + resource = self.resource or (Path(path).stem if path else None) + + result = pl.read_csv( + str(path), + infer_schema_length=0, + null_values=[""], + truncate_ragged_lines=True, + ignore_errors=True, + ) + result = result.with_columns(pl.all().cast(pl.Utf8).fill_null("")) + + n = result.height + result = result.with_columns( + pl.lit(resource or "").alias("__resource"), + pl.arange(2, n + 2).alias("__line_number"), + pl.arange(1, n + 1).alias("__entry_number"), + pl.lit(str(path) if path else "").alias("__path"), + ) + + return result diff --git a/digital_land/phase_polars/load/__init__.py b/digital_land/phase_polars/load/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/load/save_database.py b/digital_land/phase_polars/load/save_database.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/load/save_file.py b/digital_land/phase_polars/load/save_file.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/lookup.py b/digital_land/phase_polars/lookup.py new file mode 100644 index 000000000..4abed6534 --- /dev/null +++ b/digital_land/phase_polars/lookup.py @@ -0,0 +1,327 @@ +import re +import logging + +import polars as pl + +from .phase import PolarsPhase + +normalise_pattern = re.compile(r"[^a-z0-9-]") + + +def normalise(value): + return re.sub(normalise_pattern, "", value.lower()) + + +def key(entry_number="", prefix="", reference="", organisation=""): + entry_number = str(entry_number) + prefix = normalise(prefix) + reference = normalise(reference) + organisation = normalise(organisation) + return ",".join([entry_number, prefix, reference, organisation]) + + +class EntityLookupPhase(PolarsPhase): + """ + Look up 
class EntityLookupPhase(PolarsPhase):
    """
    Look up entity numbers by CURIE (prefix:reference).
    """

    def __init__(
        self,
        lookups=None,
        redirect_lookups=None,
        issue_log=None,
        operational_issue_log=None,
        entity_range=None,
    ):
        # lookups: key() -> entity map; redirect_lookups: entity -> redirect
        # entry; entity_range: optional [min, max) of acceptable entities
        if lookups is None:
            lookups = {}
        if redirect_lookups is None:
            redirect_lookups = {}
        self.lookups = lookups
        self.redirect_lookups = redirect_lookups
        self.issues = issue_log
        self.operational_issues = operational_issue_log
        self.entity_range = entity_range or []

    def _lookup(self, prefix="", reference="", organisation="", entry_number=""):
        # precedence: entry-number key, then organisation-scoped key,
        # then the bare prefix/reference key
        return (
            self.lookups.get(
                key(prefix=prefix, entry_number=entry_number), ""
            )
            or self.lookups.get(
                key(prefix=prefix, organisation=organisation, reference=reference), ""
            )
            or self.lookups.get(
                key(prefix=prefix, reference=reference), ""
            )
        )

    def _redirect(self, entity):
        # "301" redirects to another entity, "410" removes the entity
        if self.redirect_lookups and entity:
            redirect_entry = self.redirect_lookups.get(str(entity), "")
            if redirect_entry:
                if redirect_entry["status"] == "301":
                    return redirect_entry["entity"]
                elif redirect_entry["status"] == "410":
                    return ""
        return entity

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Assign an entity number to every row, logging unresolved CURIEs."""
        if df is None or df.height == 0:
            return df

        if "entity" not in df.columns:
            df = df.with_columns(pl.lit("").alias("entity"))
        if "prefix" not in df.columns:
            df = df.with_columns(pl.lit("").alias("prefix"))
        if "reference" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference"))

        entities = []
        for row in df.iter_rows(named=True):
            existing = row.get("entity", "") or ""
            prefix = row.get("prefix", "") or ""
            reference = row.get("reference", "") or ""
            # local-authority-eng keys were renamed to local-authority
            organisation = (row.get("organisation", "") or "").replace(
                "local-authority-eng", "local-authority"
            )
            entry_number = row.get("__entry_number", "")
            line_number = row.get("__line_number", "")
            resource = row.get("__resource", "")

            if existing:
                # respect an entity number assigned upstream
                entities.append(existing)
                continue

            if not prefix:
                entities.append("")
                continue

            entity = self._lookup(
                prefix=prefix,
                reference=reference,
                organisation=organisation,
                entry_number=entry_number,
            )

            if entity and self.entity_range:
                try:
                    if int(entity) not in range(
                        int(self.entity_range[0]), int(self.entity_range[1])
                    ):
                        if self.issues:
                            self.issues.resource = resource
                            self.issues.line_number = line_number
                            self.issues.entry_number = entry_number
                            self.issues.log_issue(
                                "entity", "entity number out of range", entity
                            )
                except (ValueError, TypeError):
                    # non-numeric entity or range bound - skip the check
                    pass

            if not entity:
                curie = f"{prefix}:{reference}"
                if self.issues:
                    self.issues.resource = resource
                    self.issues.line_number = line_number
                    self.issues.entry_number = entry_number
                    if not reference:
                        self.issues.log_issue(
                            "entity",
                            "unknown entity - missing reference",
                            curie,
                            line_number=line_number,
                        )
                    else:
                        self.issues.log_issue(
                            "entity",
                            "unknown entity",
                            curie,
                            line_number=line_number,
                        )
                if self.operational_issues:
                    self.operational_issues.log_issue(
                        "entity",
                        "unknown entity",
                        curie,
                        line_number=line_number,
                    )
                entities.append("")
            else:
                entity = self._redirect(entity)
                entities.append(entity)

        df = df.with_columns(pl.Series("entity", entities))

        # Record entity map for issue log
        if self.issues:
            for row in df.iter_rows(named=True):
                entry_number = row.get("__entry_number", "")
                entity = row.get("entity", "")
                if entity:
                    self.issues.record_entity_map(entry_number, entity)

        return df
+ """ + + def __init__( + self, + lookups=None, + redirect_lookups=None, + issue_log=None, + odp_collections=None, + ): + if lookups is None: + lookups = {} + if redirect_lookups is None: + redirect_lookups = {} + if odp_collections is None: + odp_collections = [] + self.lookups = lookups + self.redirect_lookups = redirect_lookups + self.issues = issue_log + self.odp_collections = odp_collections + + def _lookup(self, prefix="", reference="", organisation=""): + return ( + self.lookups.get( + key(prefix=prefix, organisation=organisation, reference=reference), "" + ) + or self.lookups.get( + key(prefix=prefix, reference=reference), "" + ) + ) + + def _check_associated_organisation(self, entity): + reverse_lookups = {} + for k, v in self.lookups.items(): + if v not in reverse_lookups: + reverse_lookups[v] = [] + reverse_lookups[v].append(k) + + if entity in reverse_lookups: + keywords = {"authority", "development", "government"} + for k in reverse_lookups[entity]: + parts = k.split(",") + if len(parts) > 3 and any(kw in parts[3] for kw in keywords): + return "" + return entity + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + if "reference-entity" not in df.columns: + df = df.with_columns(pl.lit("").alias("reference-entity")) + + ref_entities = [] + for row in df.iter_rows(named=True): + prefix = row.get("prefix", "") or "" + reference = row.get("reference", "") or "" + entity_number = row.get("entity", "") or "" + line_number = row.get("line-number", "") or row.get("__line_number", "") + organisation = (row.get("organisation", "") or "").replace( + "local-authority-eng", "local-authority" + ) + + if not (prefix and reference and entity_number): + ref_entities.append(row.get("reference-entity", "") or "") + continue + + find_entity = self._lookup( + prefix=prefix, organisation=organisation, reference=reference + ) + if not find_entity: + find_entity = self._lookup(prefix=prefix, reference=reference) + 
find_entity = self._check_associated_organisation(find_entity) + + if not find_entity or ( + str(find_entity) in self.redirect_lookups + and int(self.redirect_lookups[str(find_entity)].get("status", 0)) == 410 + ): + if self.odp_collections and prefix in self.odp_collections: + if self.issues: + self.issues.log_issue( + prefix, + "missing associated entity", + reference, + line_number=line_number, + ) + ref_entities.append("") + else: + ref_entities.append(str(find_entity)) + + df = df.with_columns(pl.Series("reference-entity", ref_entities)) + return df + + +class PrintLookupPhase(PolarsPhase): + """ + Print new lookup entries for unresolved entities. + """ + + def __init__(self, lookups=None, redirect_lookups=None): + if lookups is None: + lookups = {} + if redirect_lookups is None: + redirect_lookups = {} + self.lookups = lookups + self.redirect_lookups = redirect_lookups + self.new_lookup_entries = [] + + def _lookup(self, prefix="", reference="", organisation="", entry_number=""): + return ( + self.lookups.get( + key(prefix=prefix, entry_number=entry_number), "" + ) + or self.lookups.get( + key(prefix=prefix, organisation=organisation, reference=reference), "" + ) + or self.lookups.get( + key(prefix=prefix, reference=reference), "" + ) + ) + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + for row in df.iter_rows(named=True): + prefix = row.get("prefix", "") or "" + organisation = row.get("organisation", "") or "" + reference = row.get("reference", "") or "" + entry_number = row.get("__entry_number", "") + + entity = "" + if prefix: + entity = self._lookup( + prefix=prefix, + reference=reference, + organisation=organisation, + entry_number=entry_number, + ) + + if not entity: + if prefix and organisation and reference: + if "," in reference: + reference = f'"{reference}"' + new_lookup = { + "prefix": prefix, + "organisation": organisation, + "reference": reference, + } + 
# characters allowed in a normalised column name ('_' is first mapped to '-')
normalise_pattern = re.compile(r"[^a-z0-9-_]")


def normalise(name):
    """Lower-case *name*, map '_' to '-' and strip other punctuation."""
    new_name = name.replace("_", "-")
    return re.sub(normalise_pattern, "", new_name.lower())


class MapPhase(PolarsPhase):
    """
    Rename columns according to the column map and specification fieldnames.
    """

    def __init__(self, fieldnames, columns=None, log=None):
        # columns: explicit {normalised-column: field} overrides;
        # fieldnames: specification fields matched by normalised name
        if columns is None:
            columns = {}
        self.columns = columns
        self.normalised_fieldnames = {normalise(f): f for f in fieldnames}
        if not log:
            log = ColumnFieldLog()
        self.log = log

    def headers(self, column_names):
        """Build the header mapping (column_name → field_name)."""
        headers = {}
        matched = []

        # explicit column map takes precedence over specification fieldnames
        for header in sorted(column_names):
            fieldname = normalise(header)
            for pattern, value in self.columns.items():
                if fieldname == pattern:
                    matched.append(value)
                    headers[header] = value

        for header in sorted(column_names):
            if header in headers:
                continue
            fieldname = normalise(header)
            if fieldname not in matched and fieldname in self.normalised_fieldnames:
                headers[header] = self.normalised_fieldnames[fieldname]

        # when both GeoX/GeoY and Easting/Northing columns are mapped, move
        # GeoX/GeoY to the end of the dict so they win the keep-last dedupe
        if {"GeoX", "Easting"} <= headers.keys():
            item = headers.pop("GeoX")
            headers["GeoX"] = item

        if {"GeoY", "Northing"} <= headers.keys():
            item = headers.pop("GeoY")
            headers["GeoY"] = item

        return headers

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Rename/drop data columns per the header map, keeping metadata columns."""
        if df is None or df.height == 0:
            return df

        data_cols = [c for c in df.columns if not c.startswith("__")]
        header_map = self.headers(data_cols)

        # Log headers
        for col, field in header_map.items():
            self.log.add(column=col, field=field)

        # Select only mapped columns (drop unmapped data cols), keep metadata
        meta_cols = [c for c in df.columns if c.startswith("__")]

        select_exprs = []
        for col, field in header_map.items():
            if field == "IGNORE":
                continue
            select_exprs.append(pl.col(col).fill_null("").alias(field))

        # Add metadata columns
        for mc in meta_cols:
            select_exprs.append(pl.col(mc))

        # If multiple columns map to the same target field, keep the last one
        # (matching the original generator behaviour). The previous version
        # initialised a dead list and rebuilt list(seen.values()) inside the
        # loop on every iteration; build it once after the dict is complete.
        seen = {}
        for expr in select_exprs:
            seen[expr.meta.output_name()] = expr

        return df.select(list(seen.values()))
+ """ + + def __init__(self, fields, migrations): + self.migrations = migrations + self.fields = list( + set(fields + ["entity", "organisation", "prefix", "reference"]) + ) + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + meta_cols = [c for c in df.columns if c.startswith("__")] + data_cols = [c for c in df.columns if not c.startswith("__")] + + exprs = [] + for field in self.fields: + migrated_from = self.migrations.get(field) + if migrated_from and migrated_from in df.columns: + exprs.append(pl.col(migrated_from).alias(field)) + elif field in df.columns: + exprs.append(pl.col(field)) + # else: field not present in df, skip + + # Handle GeoX/GeoY → point conversion + has_geoxy = "GeoX" in df.columns and "GeoY" in df.columns + if has_geoxy and "point" in self.fields: + exprs.append( + pl.when( + pl.col("GeoX").is_not_null() + & (pl.col("GeoX") != "") + & pl.col("GeoY").is_not_null() + & (pl.col("GeoY") != "") + ) + .then( + pl.concat_str( + [pl.lit("POINT("), pl.col("GeoX"), pl.lit(" "), pl.col("GeoY"), pl.lit(")")], + separator="", + ) + ) + .otherwise(pl.lit("")) + .alias("point") + ) + + # Add metadata columns + for mc in meta_cols: + exprs.append(pl.col(mc)) + + # Deduplicate by alias (keep last in case of conflict, e.g. point) + seen = {} + for expr in exprs: + name = expr.meta.output_name() + seen[name] = expr + exprs = list(seen.values()) + + return df.select(exprs) diff --git a/digital_land/phase_polars/normalise.py b/digital_land/phase_polars/normalise.py new file mode 100644 index 000000000..cd03f2785 --- /dev/null +++ b/digital_land/phase_polars/normalise.py @@ -0,0 +1,84 @@ +import csv +import os +import re + +import polars as pl + +from .phase import PolarsPhase + +patch_dir = os.path.join(os.path.dirname(__file__), "../patch") + + +class NormalisePhase(PolarsPhase): + """ + Normalise CSV whitespace, strip null patterns and skip matching rows. 
class NormalisePhase(PolarsPhase):
    """
    Normalise CSV whitespace, strip null patterns and skip matching rows.

    In the streaming pipeline this operates on raw lines *before* parsing.
    In the Polars pipeline it operates on already-parsed string columns
    which gives equivalent results.
    """

    null_path = os.path.join(patch_dir, "null.csv")

    def __init__(self, skip_patterns=None):
        if skip_patterns is None:
            skip_patterns = []
        self.skip_patterns = [re.compile(p) for p in skip_patterns]

        # null.csv lists regex patterns whose matches are blanked out; open
        # it with a context manager so the handle is always closed (the
        # previous version leaked the file object from a bare open())
        self.null_patterns = []
        if os.path.exists(self.null_path):
            with open(self.null_path, newline="") as f:
                for row in csv.DictReader(f):
                    self.null_patterns.append(re.compile(row["pattern"]))

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Strip whitespace, blank null patterns, drop blank and skipped rows."""
        if df is None or df.height == 0:
            return df

        # Identify data columns (non-metadata)
        data_cols = [c for c in df.columns if not c.startswith("__")]

        # Strip whitespace and normalise embedded line endings
        strip_exprs = [
            pl.col(c)
            .str.strip_chars()
            .str.replace_all(r"\r", "")
            .str.replace_all(r"\n", "\r\n")
            .alias(c)
            for c in data_cols
        ]
        if strip_exprs:
            df = df.with_columns(strip_exprs)

        # Apply null patterns to all data columns
        for pattern in self.null_patterns:
            null_exprs = [
                pl.col(c).str.replace_all(pattern.pattern, "").alias(c)
                for c in data_cols
            ]
            if null_exprs:
                df = df.with_columns(null_exprs)

        # Remove completely blank rows (all data columns empty or null)
        if data_cols:
            not_blank = pl.lit(False)
            for c in data_cols:
                not_blank = not_blank | (
                    pl.col(c).is_not_null() & (pl.col(c) != "")
                )
            df = df.filter(not_blank)

        # Skip rows matching skip patterns (matched against the full
        # comma-joined line, mirroring the raw-line streaming behaviour)
        if self.skip_patterns and data_cols:
            concat_expr = pl.concat_str(
                [pl.col(c).fill_null("") for c in data_cols], separator=","
            ).alias("__skip_line")
            df = df.with_columns(concat_expr)

            for pattern in self.skip_patterns:
                df = df.filter(
                    ~pl.col("__skip_line").str.contains(pattern.pattern)
                )

            df = df.drop("__skip_line")

        return df
class OrganisationPhase(PolarsPhase):
    """
    Look up the organisation value.
    """

    def __init__(self, organisation=None, issues=None):
        # organisation: helper object exposing .lookup(value); issues: issue log
        self.organisation = organisation
        self.issues = issues

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Resolve each row's organisation, logging rows that fail to resolve."""
        if df is None or df.height == 0:
            return df

        if "organisation" not in df.columns:
            df = df.with_columns(pl.lit("").alias("organisation"))

        if self.organisation is None:
            # no lookup helper configured - pass through unchanged
            return df

        # Apply organisation lookup row-by-row (lookup may be complex)
        def _lookup(val):
            result = self.organisation.lookup(val if val else "")
            return result if result else ""

        df = df.with_columns(
            pl.col("organisation")
            .map_elements(_lookup, return_dtype=pl.Utf8)
            .alias("__org_resolved")
        )

        # Log issues for rows where organisation could not be resolved;
        # rows whose input organisation was already empty are skipped by
        # the `if org_val` guard below
        if self.issues:
            for row in df.filter(pl.col("__org_resolved") == "").iter_rows(named=True):
                org_val = row.get("organisation", "")
                if org_val:
                    self.issues.resource = row.get("__resource", "")
                    self.issues.line_number = row.get("__line_number", 0)
                    self.issues.entry_number = row.get("__entry_number", 0)
                    self.issues.log_issue(
                        "organisation", "invalid organisation", org_val
                    )

        df = df.with_columns(
            pl.col("__org_resolved").alias("organisation")
        ).drop("__org_resolved")

        return df
+ """ + + def __init__(self, issues=None, patches=None): + if patches is None: + patches = {} + self.issues = issues + self.patches = patches + + def _apply_patch_value(self, fieldname, value): + """Apply patch to a single value – mirrors streaming logic exactly.""" + patches = {**self.patches.get(fieldname, {}), **self.patches.get("", {})} + for pattern, replacement in patches.items(): + original_pattern = pattern + if pattern == value: + pattern = f"^{re.escape(pattern)}$" + match = re.match(pattern, value, flags=re.IGNORECASE) + if match: + newvalue = match.expand(replacement) + if newvalue != value: + if self.issues: + self.issues.log_issue(fieldname, "patch", value) + return newvalue + return value + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0 or not self.patches: + return df + + data_cols = [c for c in df.columns if not c.startswith("__")] + + # Determine which fields have patches + patched_fields = set(self.patches.keys()) - {""} + global_patches = self.patches.get("", {}) + + fields_to_patch = set() + for col in data_cols: + if col in patched_fields or global_patches: + fields_to_patch.add(col) + + if not fields_to_patch: + return df + + # Use map_elements per field for correctness (regex expand logic is complex) + for field in fields_to_patch: + if field not in df.columns: + continue + + field_patches = { + **self.patches.get(field, {}), + **self.patches.get("", {}), + } + if not field_patches: + continue + + def make_patcher(fname, fpatch): + def _patch(val): + if val is None or val == "": + return val + for pattern, replacement in fpatch.items(): + p = pattern + if p == val: + p = f"^{re.escape(p)}$" + m = re.match(p, val, flags=re.IGNORECASE) + if m: + newval = m.expand(replacement) + return newval + return val + return _patch + + patcher = make_patcher(field, field_patches) + df = df.with_columns( + pl.col(field) + .map_elements(patcher, return_dtype=pl.Utf8) + .alias(field) + ) + + return df diff --git 
class PolarsPhase:
    """
    A step in a Polars-based pipeline process.

    Each phase takes a Polars DataFrame and returns a Polars DataFrame.
    Metadata columns (prefixed with __) carry through the pipeline:
    __resource, __line_number, __entry_number, __path, __dataset, __priority
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        # identity by default; subclasses override
        return df


class PivotPhase(PolarsPhase):
    """
    Unpivot entity rows into a series of facts (one row per field value).
    """

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return one fact row per (entity, field) pair, fields in sorted order."""
        if df is None or df.height == 0:
            return df

        meta_cols = [c for c in df.columns if c.startswith("__")]
        data_cols = [c for c in df.columns if not c.startswith("__") and c != "entity"]

        if "entity" not in df.columns:
            return df

        # We need to carry metadata and entity through the unpivot.
        # Polars .unpivot() works on value columns.
        # Build the result row-by-row for exact parity with the streaming version.
        rows = []
        for row in df.iter_rows(named=True):
            entity = row.get("entity", "")
            resource = row.get("__resource", "")
            line_number = row.get("__line_number", 0)
            entry_number = row.get("__entry_number", 0)
            priority = row.get("__priority", 1)
            entry_date = row.get("entry-date", "")

            for field in sorted(data_cols):
                value = row.get(field, "") or ""
                rows.append(
                    {
                        # "fact" hash is filled in by a later phase
                        "fact": "",
                        "entity": entity,
                        "field": field,
                        "value": value,
                        "priority": str(priority),
                        "resource": resource,
                        "line-number": str(line_number),
                        "entry-number": str(entry_number),
                        "entry-date": entry_date,
                        "__resource": resource,
                        "__line_number": line_number,
                        "__entry_number": entry_number,
                    }
                )

        if not rows:
            # preserve the output schema even when no facts were produced
            return pl.DataFrame(
                schema={
                    "fact": pl.Utf8,
                    "entity": pl.Utf8,
                    "field": pl.Utf8,
                    "value": pl.Utf8,
                    "priority": pl.Utf8,
                    "resource": pl.Utf8,
                    "line-number": pl.Utf8,
                    "entry-number": pl.Utf8,
                    "entry-date": pl.Utf8,
                    "__resource": pl.Utf8,
                    "__line_number": pl.Int64,
                    "__entry_number": pl.Int64,
                }
            )

        return pl.DataFrame(rows)
+ """ + + def __init__(self, dataset=None): + self.dataset = dataset + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + if "prefix" not in df.columns: + df = df.with_columns(pl.lit(self.dataset).alias("prefix")) + else: + df = df.with_columns( + pl.when( + pl.col("prefix").is_null() | (pl.col("prefix") == "") + ) + .then(pl.lit(self.dataset)) + .otherwise(pl.col("prefix")) + .alias("prefix") + ) + + return df diff --git a/digital_land/phase_polars/priority.py b/digital_land/phase_polars/priority.py new file mode 100644 index 000000000..0a3f9af3d --- /dev/null +++ b/digital_land/phase_polars/priority.py @@ -0,0 +1,59 @@ +import logging + +import polars as pl + +from .phase import PolarsPhase +from digital_land.configuration.main import Config + + +class PriorityPhase(PolarsPhase): + """ + Deduce the priority of each entry when assembling facts. + """ + + def __init__(self, config: Config = None, providers=None): + if providers is None: + providers = [] + self.providers = providers + self.default_priority = 1 + self.config = config + if not config: + logging.warning( + f"No config provided so priority defaults to {self.default_priority}" + ) + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + if "entity" not in df.columns: + df = df.with_columns(pl.lit(self.default_priority).alias("__priority")) + return df + + if self.config: + priorities = [] + organisations = [] + for row in df.iter_rows(named=True): + entity = row.get("entity", "") + authoritative_org = self.config.get_entity_organisation(entity) + if authoritative_org is not None: + if authoritative_org in self.providers: + priorities.append(2) + organisations.append(row.get("organisation", "")) + else: + priorities.append(self.default_priority) + organisations.append(authoritative_org) + else: + priorities.append(self.default_priority) + organisations.append(row.get("organisation", "")) + + df = 
df.with_columns( + pl.Series("__priority", priorities), + pl.Series("organisation", organisations), + ) + else: + df = df.with_columns( + pl.lit(self.default_priority).alias("__priority") + ) + + return df diff --git a/digital_land/phase_polars/prune.py b/digital_land/phase_polars/prune.py new file mode 100644 index 000000000..6d962cbb5 --- /dev/null +++ b/digital_land/phase_polars/prune.py @@ -0,0 +1,86 @@ +import logging + +import polars as pl + +from .phase import PolarsPhase + +logger = logging.getLogger(__name__) + + +class FieldPrunePhase(PolarsPhase): + """ + Reduce columns to only those specified for the dataset. + """ + + def __init__(self, fields): + self.fields = list( + set(fields + ["entity", "organisation", "prefix", "reference"]) + ) + logging.debug(f"pruning fields to {self.fields}") + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + meta_cols = [c for c in df.columns if c.startswith("__")] + keep = [c for c in self.fields if c in df.columns] + meta_cols + return df.select(keep) + + +class EntityPrunePhase(PolarsPhase): + """ + Remove entries with a missing entity value. 
+ """ + + def __init__(self, issue_log=None, dataset_resource_log=None): + self.log = dataset_resource_log + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + if self.log: + self.log.entry_count = 0 + return df + + if "entity" not in df.columns: + if self.log: + self.log.entry_count = 0 + return df + + # Log skipped rows + missing = df.filter( + pl.col("entity").is_null() | (pl.col("entity") == "") + ) + for row in missing.iter_rows(named=True): + resource = row.get("__resource", "") + prefix = row.get("prefix", "") + reference = row.get("reference", "") + curie = f"{prefix}:{reference}" + entry_number = row.get("__entry_number", "") + logger.info(f"{resource} row {entry_number}: missing entity for {curie}") + + result = df.filter( + pl.col("entity").is_not_null() & (pl.col("entity") != "") + ) + + if self.log: + self.log.entry_count = result.height + + return result + + +class FactPrunePhase(PolarsPhase): + """ + Remove facts with a missing value (except when field is end-date). 
+ """ + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + if "value" not in df.columns: + return df + + return df.filter( + (pl.col("value").is_not_null() & (pl.col("value") != "")) + | (pl.col("field") == "end-date") + ) diff --git a/digital_land/phase_polars/reference.py b/digital_land/phase_polars/reference.py new file mode 100644 index 000000000..6e729d8be --- /dev/null +++ b/digital_land/phase_polars/reference.py @@ -0,0 +1,133 @@ +import re +import logging + +import polars as pl + +from .phase import PolarsPhase + +logger = logging.getLogger(__name__) + +curie_re = re.compile(r"(?P[A-Za-z0-9_-]+):(?P[A-Za-z0-9_-].*)$") + + +def split_curie(value): + match = curie_re.match(value) + if not match: + return ("", value) + return (match.group("prefix"), match.group("reference")) + + +class EntityReferencePhase(PolarsPhase): + """ + Ensure each entry has prefix and reference fields derived from the reference column. + """ + + def __init__(self, dataset=None, prefix=None, issues=None): + self.dataset = dataset + self.prefix = prefix or dataset + self.issues = issues + + def _process_row(self, row_dict): + reference_value = row_dict.get("reference", "") or row_dict.get(self.dataset, "") or "" + ref_prefix, reference = split_curie(reference_value) + + if self.issues and ref_prefix: + self.issues.resource = row_dict.get("__resource", "") + self.issues.line_number = row_dict.get("__line_number", 0) + self.issues.entry_number = row_dict.get("__entry_number", 0) + self.issues.log_issue( + "reference", + "reference value contains reference_prefix", + ref_prefix, + f"Original reference split into prefix '{ref_prefix}' and reference '{reference}'", + ) + + if "UPRN" in ref_prefix: + ref_prefix = "" + + prefix = row_dict.get("prefix", "") or ref_prefix or self.prefix + return prefix, reference + + def process(self, df: pl.DataFrame) -> pl.DataFrame: + if df is None or df.height == 0: + return df + + if "prefix" not in 
class FactReferencePhase(PolarsPhase):
    """
    Ensure a fact which is a reference has prefix and reference fields.
    """

    def __init__(
        self,
        field_typology_map=None,
        field_prefix_map=None,
    ):
        # field -> typology, and field -> prefix override maps
        self.field_typology_map = field_typology_map or {}
        self.field_prefix_map = field_prefix_map or {}

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Derive prefix/reference for facts whose field has a reference typology."""
        if df is None or df.height == 0:
            return df

        if "prefix" not in df.columns:
            df = df.with_columns(pl.lit("").alias("prefix"))
        if "reference" not in df.columns:
            df = df.with_columns(pl.lit("").alias("reference"))
        if "field" not in df.columns or "value" not in df.columns:
            return df

        # typologies whose values are CURIE references
        ref_typologies = {
            "category", "document", "geography",
            "organisation", "policy", "legal-instrument",
        }

        def _process(row_dict):
            prefix = row_dict.get("prefix", "") or ""
            reference = row_dict.get("reference", "") or ""

            if prefix and reference:
                # already fully populated - nothing to derive
                return prefix, reference

            field = row_dict.get("field", "")
            typology = self.field_typology_map.get(field, "")

            if typology in ref_typologies:
                value_prefix, value_reference = split_curie(row_dict.get("value", "") or "")
                # fall back: the value's own prefix, then the configured
                # field prefix, then the field name itself
                prefix = prefix or value_prefix or self.field_prefix_map.get(field, field)
                reference = reference or value_reference

            return prefix, reference

        prefixes = []
        references = []
        for row in df.iter_rows(named=True):
            p, r = _process(row)
            prefixes.append(p)
            references.append(r)

        df = df.with_columns(
            pl.Series("prefix", prefixes),
            pl.Series("reference", references),
        )

        return df
import csv
import logging

import polars as pl

from .phase import PolarsPhase


class SavePhase(PolarsPhase):
    """
    Persist the DataFrame as CSV (to a path or an open file object),
    then pass the DataFrame through unchanged.
    """

    def __init__(self, path=None, f=None, fieldnames=None, enabled=True):
        self.path = path
        self.f = f
        self.fieldnames = fieldnames
        self.enabled = enabled

    def process(self, df: pl.DataFrame) -> pl.DataFrame:
        """Write the non-metadata columns to CSV and return df unchanged."""
        # Nothing to do when disabled or there are no rows.
        if not self.enabled or df is None or df.height == 0:
            return df

        # Metadata columns (double-underscore prefixed) are never saved.
        data_cols = {c for c in df.columns if not c.startswith("__")}

        if self.fieldnames:
            # Restrict to the requested fieldnames that actually exist.
            columns = sorted(name for name in self.fieldnames if name in data_cols)
        else:
            columns = sorted(data_cols)

        if columns:
            subset = df.select(columns)
            if self.f:
                # Open file object takes priority over a path.
                self.f.write(subset.write_csv())
            elif self.path:
                subset.write_csv(str(self.path))

        return df
e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/fact_hash.py b/digital_land/phase_polars/transform/fact_hash.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/field_prune.py b/digital_land/phase_polars/transform/field_prune.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/filter.py b/digital_land/phase_polars/transform/filter.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/flatten.py b/digital_land/phase_polars/transform/flatten.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/map.py b/digital_land/phase_polars/transform/map.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/migrate.py b/digital_land/phase_polars/transform/migrate.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/normalise.py b/digital_land/phase_polars/transform/normalise.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/parse.py b/digital_land/phase_polars/transform/parse.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/patch.py b/digital_land/phase_polars/transform/patch.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/pivot.py b/digital_land/phase_polars/transform/pivot.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/priority.py b/digital_land/phase_polars/transform/priority.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/resolve_organisation.py b/digital_land/phase_polars/transform/resolve_organisation.py deleted file mode 100644 index e69de29bb..000000000 diff --git 
a/digital_land/phase_polars/transform/set_default.py b/digital_land/phase_polars/transform/set_default.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/digital_land/phase_polars/transform/validate.py b/digital_land/phase_polars/transform/validate.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/local_testing/Makefile b/local_testing/Makefile new file mode 100644 index 000000000..53835a7a6 --- /dev/null +++ b/local_testing/Makefile @@ -0,0 +1,102 @@ +.PHONY: help init setup-dirs setup-spec check-spec list run run-all fast compare test clean clean-all + +PYTHON := venv/bin/python3 +PIP := venv/bin/pip +SPEC_DIR := ../specification + +help: + @echo "Title Boundary Pipeline" + @echo "" + @echo " make init First time setup (dirs + venv + spec check)" + @echo " make setup-dirs Create all required directories" + @echo " make setup-spec Clone specification files from GitHub" + @echo " make check-spec Check if specification files exist" + @echo " make list List available Local Authorities" + @echo " make run LA=Name Process with comparison (Original + Polars)" + @echo " make run-all Process ALL LAs with comparison" + @echo " make fast LA=Name DuckDB+Parquet with comparison" + @echo " make test Test all module imports" + @echo " make clean Remove generated data" + @echo " make clean-all Remove data + venv" + @echo "" + @echo "Note: All run commands automatically include Polars comparison" + @echo "" + @echo "Examples:" + @echo " make init # Complete setup" + @echo " make setup-spec # Clone specification if missing" + @echo " make run LA=Buckinghamshire LIMIT=100 # Compare both pipelines" + @echo " make run LA=Buckinghamshire PHASES=1,2,9 # Run specific phases" + @echo " make run-all LIMIT=100 # Process all LAs with comparison" + @echo " make fast LA=\"East Sussex\" # Fast mode with comparison" + +setup-dirs: + @mkdir -p raw extracted converted output reports cache pipeline + @echo "✅ Created directories: raw/ extracted/ converted/ 
output/ reports/ cache/ pipeline/" + +setup-spec: + @if [ -d "$(SPEC_DIR)" ]; then \ + echo "✅ Specification already exists at $(SPEC_DIR)"; \ + else \ + echo "📥 Cloning specification from GitHub..."; \ + cd .. && git clone https://github.com/digital-land/specification.git; \ + if [ -d "$(SPEC_DIR)" ]; then \ + echo "✅ Specification cloned successfully"; \ + echo " Files: $$(ls -1 $(SPEC_DIR)/*.csv 2>/dev/null | wc -l | tr -d ' ') CSV files"; \ + else \ + echo "❌ Failed to clone specification"; \ + exit 1; \ + fi \ + fi + +check-spec: + @if [ -d "$(SPEC_DIR)" ]; then \ + echo "✅ Specification found at $(SPEC_DIR)"; \ + echo " Files: $$(ls -1 $(SPEC_DIR)/*.csv 2>/dev/null | wc -l | tr -d ' ') CSV files"; \ + else \ + echo "❌ Specification not found at $(SPEC_DIR)"; \ + echo ""; \ + echo "Run 'make setup-spec' to clone automatically, or:"; \ + echo " cd ../"; \ + echo " git clone https://github.com/digital-land/specification.git"; \ + echo ""; \ + exit 1; \ + fi + +init: setup-dirs venv setup-spec + @echo "✅ Setup complete - ready to run pipeline" + +venv: + @python3 -m venv venv + @$(PIP) install -q --upgrade pip + @$(PIP) install -q polars duckdb requests + @$(PIP) install -q -e .. 
+ @echo " ✓ Installed digital-land-python in editable mode" + +list: venv + @$(PYTHON) main.py --list + +run: venv + @test -n "$(LA)" || (echo "Error: make run LA=Name"; exit 1) + @$(PYTHON) main.py --la "$(LA)" --compare $(if $(LIMIT),--limit $(LIMIT)) $(if $(PHASES),--phases $(PHASES)) + +run-all: venv + @$(PYTHON) run_all.py $(LIMIT) + +fast: venv + @test -n "$(LA)" || (echo "Error: make fast LA=Name"; exit 1) + @$(PYTHON) main.py --la "$(LA)" --use-duckdb --use-parquet --compare $(if $(LIMIT),--limit $(LIMIT)) + +compare: venv + @test -n "$(LA)" || (echo "Error: make compare LA=Name"; exit 1) + @$(PYTHON) main.py --la "$(LA)" --compare $(if $(LIMIT),--limit $(LIMIT)) + +test: venv + @$(PYTHON) -c "from cli import CLI; from file_downloader import FileDownloader; from gml_extractor import GMLExtractor; from gml_converter import GMLConverter; from pipeline_config import PipelineConfig; from pipeline_runner import PipelineRunner; from pipeline_report import PipelineReport; print('✅ All modules OK')" + +clean: + @rm -rf raw/* extracted/* converted/* output/* reports/* + @echo "✅ Data cleaned" + +clean-all: clean + @rm -rf venv/ cache/* + @echo "✅ All cleaned" diff --git a/local_testing/README.md b/local_testing/README.md new file mode 100644 index 000000000..41de90854 --- /dev/null +++ b/local_testing/README.md @@ -0,0 +1,311 @@ +# Digital Land Pipeline - Local Testing + +A modular, self-contained environment for testing the digital-land transformation pipeline +on various datasets (e.g., UK Land Registry title-boundary data). + +## Architecture + +The pipeline uses a **clean modular architecture** with 8 specialized classes: + +1. **CLI** (121 lines) - Command-line interface and argument parsing +2. **FileDownloader** (95 lines) - Downloads GML files from data sources +3. **GMLExtractor** (50 lines) - Extracts GML from ZIP archives +4. **GMLConverter** (458 lines) - Converts GML to CSV/Parquet (4 strategies) +5. 
**PipelineConfig** (93 lines) - Manages pipeline configuration files +6. **PipelineRunner** (254 lines) - Executes 26-phase digital-land transformation +7. **PipelineReport** (346 lines) - Performance tracking and reporting +8. **main.py** (265 lines) - Orchestrates the pipeline by calling specialized classes + +**Total**: 2,449 lines across 9 focused, testable modules (refactored from a single 1,688-line monolithic script) + +## Prerequisites + +### Specification Files + +The pipeline requires specification files from the digital-land specification repository. These files define schemas, fields, datatypes, and pipeline configurations. + +**Files Used (11 of 25):** +- `dataset.csv`, `schema.csv`, `dataset-schema.csv` +- `datatype.csv`, `field.csv`, `dataset-field.csv`, `schema-field.csv` +- `typology.csv`, `pipeline.csv`, `licence.csv`, `provision-rule.csv` + +The remaining 14 files are not loaded by the pipeline but may be used by other digital-land tools. + +## Quick Start + +**Using Makefile (Recommended):** + +```bash +# Navigate to directory +cd digital-land-python/local_testing + +# First time setup - automatically creates directories, installs dependencies, and clones specification +make init + +# If specification already exists elsewhere, you can symlink it instead: +# cd digital-land-python +# ln -s /path/to/your/specification specification + +# Verify setup +make check-spec + +# List available Local Authorities (for title-boundary dataset) +make list + +# Process a specific LA (includes Polars comparison automatically) +make run LA="Buckinghamshire" + +# Process with record limit +make run LA="Buckinghamshire" LIMIT=100 + +# Process ALL Local Authorities (batch mode with comparison) +make run-all + +# Process all with record limit (for testing) +make run-all LIMIT=100 + +# Run only specific phases (e.g., phases 1,2,9) +make run LA="Buckinghamshire" PHASES="1,2,9" + +# Run range of phases (e.g., phases 1-5 and 9) +make run LA="Buckinghamshire" PHASES="1-5,9" + +# Use best
performance (DuckDB + Parquet, includes comparison) +make fast LA="Buckinghamshire" + +# See all available commands +make help + +# Note: All run commands automatically include Polars comparison +``` + +**Manual Setup (Alternative):** + +```bash +# Navigate to local testing directory +cd digital-land-python/local_testing + +# Create virtual environment (first time only) +python3 -m venv venv + +# Activate virtual environment +source venv/bin/activate + +# Install dependencies (first time only) +pip install polars duckdb + +# List available items (dataset-specific) +python main.py --list + +# Process a specific item +python main.py --la "Buckinghamshire" + +# Process with record limit (for testing) +python main.py --la "Buckinghamshire" --limit 100 + +# Skip download if already have the file +python main.py --la "Buckinghamshire" --skip-download + +# Use DuckDB with Parquet for best performance +python main.py --la "Buckinghamshire" --use-duckdb --use-parquet +``` + +## What It Does + +The pipeline performs 5 steps: + +1. **Download** (FileDownloader) - Fetches data files from source API +2. **Extract** (GMLExtractor) - Unzips and locates GML files +3. **Convert** (GMLConverter) - Parses GML and converts to CSV/Parquet (4 methods available) +4. **Transform** (PipelineRunner) - Runs full 26-phase digital-land pipeline +5. **Report** (PipelineReport) - Generates performance report (JSON + text) + +Each step delegates to a specialized class for clean separation of concerns. 
+ +## Directory Structure + +``` +local_testing/ +├── main.py # Main orchestration (265 lines) +├── cli.py # Command-line interface (121 lines) +├── file_downloader.py # Downloads GML files (95 lines) +├── gml_extractor.py # ZIP extraction (50 lines) +├── gml_converter.py # GML conversion (458 lines) +├── pipeline_config.py # Config management (93 lines) +├── pipeline_runner.py # 26-phase transformation (254 lines) +├── pipeline_report.py # Performance tracking (346 lines) +├── polars_phases.py # Polars-optimized phases (767 lines) +├── Makefile # Make commands for easy setup and running +├── README.md # This file +├── .gitignore # Git ignore file +├── venv/ # Virtual environment (created with: make init) +├── raw/ # Downloaded ZIP files +├── extracted/ # Extracted GML files +├── converted/ # GML converted to CSV/Parquet +├── output/ # Pipeline output (harmonised + facts) +├── reports/ # Performance reports +├── cache/ # Organisation.csv cache +├── pipeline/ # Pipeline configuration CSVs +├── specification/ # digital-land specification files +└── scripts/ # Helper scripts +``` + +## Module Overview + +### CLI (`cli.py`) +- Argument parsing with `argparse` +- Fetches endpoint list from GitHub +- Lists and matches data items +- Clean separation of UI logic + +### FileDownloader (`file_downloader.py`) +- Downloads files from APIs +- Progress tracking with byte counts +- Reusable for any file download needs + +### GMLExtractor (`gml_extractor.py`) +- Extracts GML files from ZIP archives +- Handles nested directory structures +- Simple, focused responsibility + +### GMLConverter (`gml_converter.py`) +- **4 conversion strategies**: + 1. Regex → CSV (default, no dependencies) + 2. Regex → Parquet (Polars) + 3. DuckDB → CSV (spatial extension) + 4. 
DuckDB → Parquet (fastest, best) +- Parses GML polygons to WKT +- Handles coordinate transformation + +### PipelineConfig (`pipeline_config.py`) +- Creates pipeline configuration CSVs +- Downloads organization.csv +- Ensures all config files exist + +### PipelineRunner (`pipeline_runner.py`) +- Executes 26-phase digital-land pipeline +- Lazy imports for fast startup +- Per-phase timing and metrics +- Handles Parquet/CSV input + +### PipelineReport (`pipeline_report.py`) +- Tracks step and phase metrics +- Generates JSON and text reports +- Calculates durations and throughput +- Supports comparison reporting + +## Output Files + +After running the pipeline, you will find: + +**Pipeline Output:** +- `output/{name}_harmonised.csv` - Intermediate harmonised data +- `output/{name}_facts.csv` - Final fact table output +- `output/{name}_issues.csv` - Any issues logged during processing + +**Performance Reports:** + +1. **Single LA Report** (default) + - `reports/{name}_{timestamp}_performance.json` - Detailed JSON report + - `reports/{name}_{timestamp}_performance.txt` - Human-readable text report + - Shows timing for all 26 phases + +2. **Selective Phase Report** (when using `--phases`) + - Same format as above + - Only includes metrics for selected phases + - Useful for testing specific transformations + +3. 
**Batch Summary Report** (when using `make run-all`) + - `reports/batch_{timestamp}_summary.json` - Aggregate metrics for entire batch + - Includes total time, per-LA timing, success/error counts + - Shows min/max/average processing times across all LAs + - **All run commands now include automatic Polars comparison** (both Original + Polars pipelines) + +## Command Line Options + +| Option | Description | +|--------|-------------| +| `--la NAME` | Item name (partial match) | +| `--limit N` | Limit number of records to process | +| `--skip-download` | Use existing downloaded data | +| `--list` | List all available items | +| `--use-duckdb` | Use DuckDB with spatial extension for GML conversion (faster, proper CRS transform) | +| `--use-parquet` | Output Parquet instead of CSV (faster reads, smaller files) | +| `--phases` | Run specific phases (e.g., `1,2,9` or `1-5,9`) | +| `--compare` | Run both original and Polars pipelines for comparison (enabled by default in Makefile) | + +## GML Conversion Methods + +The **GMLConverter** class supports multiple conversion strategies: + +### Output Formats + +| Format | Flag | Advantages | +|--------|------|------------| +| **CSV** | (default) | Universal, human-readable | +| **Parquet** | `--use-parquet` | 3-10x smaller, faster reads, preserves types | + +### Conversion Engines + +| Engine | Flag | Speed | Features | +|--------|------|-------|----------| +| **Regex** | (default) | Slow | No dependencies | +| **DuckDB** | `--use-duckdb` | Fast | Proper CRS transform, spatial extension | + +### Best Performance + +For the fastest conversion, use DuckDB with Parquet output: + +```bash +# Best performance: DuckDB → Parquet +python main.py --la "Buckinghamshire" --use-duckdb --use-parquet +``` + +## Testing the Modular Architecture + +All classes are independently testable: + +```bash +# Navigate to directory +cd digital-land-python/local_testing + +# Activate venv +source venv/bin/activate + +# Verify all modules work +python3 -c 
" +from cli import CLI +from file_downloader import FileDownloader +from gml_extractor import GMLExtractor +from gml_converter import GMLConverter +from pipeline_config import PipelineConfig +from pipeline_runner import PipelineRunner +from pipeline_report import PipelineReport +print('✅ All modules imported successfully') +" +``` + +## Notes + +- Virtual environment should be created in `local_testing/venv/` +- Entity assignment requires a lookup table (`pipeline/lookup.csv`) +- Without lookups, harmonised data will have empty entity field +- Facts output will be empty without entity lookups +- Coordinates are converted from OSGB (EPSG:27700) to WGS84 +- Requirements: `pip install polars duckdb` +- Parquet uses Snappy compression by default +- Add `venv/` to `.gitignore` to avoid committing virtual environment +- **Reusable for other datasets** - Just update the endpoint URL in CLI or main.py + +## Development + +Each module can be modified independently: + +- **Add new conversion method**: Edit `GMLConverter.convert_to_*()` methods +- **Change CLI options**: Edit `CLI.create_parser()` +- **Add new pipeline phases**: Edit `PipelineRunner.run_full_pipeline()` +- **Modify reporting**: Edit `PipelineReport` metrics and output formats +- **Add new data sources**: Create new downloader classes following `FileDownloader` pattern +- **Adapt for new datasets**: Update endpoint URLs and field mappings in relevant classes + +The modular structure makes it easy to extend and test each component in isolation. diff --git a/local_testing/cli.py b/local_testing/cli.py new file mode 100644 index 000000000..2eb0f7743 --- /dev/null +++ b/local_testing/cli.py @@ -0,0 +1,148 @@ +""" +Command-line interface for title-boundary pipeline. + +Handles argument parsing and provides user-facing CLI functions. 
+""" + +import argparse +from typing import List, Dict + +from file_downloader import FileDownloader + + +class CLI: + """Command-line interface manager.""" + + ENDPOINT_CSV_URL = "https://raw.githubusercontent.com/digital-land/config/main/collection/title-boundary/endpoint.csv" + + @staticmethod + def create_parser() -> argparse.ArgumentParser: + """ + Create argument parser for CLI. + + Returns: + Configured ArgumentParser instance + """ + parser = argparse.ArgumentParser( + description="Title Boundary Pipeline - Download, Convert, and Transform", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python main.py # List available LAs + python main.py --la "Buckinghamshire" # Process Buckinghamshire + python main.py --la "Buckinghamshire" --limit 100 # Limit to 100 records + python main.py --use-duckdb --use-parquet # Best performance + """, + ) + + parser.add_argument( + "--la", type=str, help="Local Authority name (partial match)" + ) + parser.add_argument( + "--limit", type=int, help="Limit number of records to process" + ) + parser.add_argument( + "--skip-download", + action="store_true", + help="Skip download, use existing data", + ) + parser.add_argument( + "--list", action="store_true", help="List available Local Authorities" + ) + parser.add_argument( + "--use-duckdb", + action="store_true", + help="Use DuckDB for GML conversion (faster)", + ) + parser.add_argument( + "--use-parquet", action="store_true", help="Output Parquet instead of CSV" + ) + parser.add_argument( + "--compare", + action="store_true", + help="Run both original and Polars pipelines for performance comparison", + ) + parser.add_argument( + "--phases", + type=str, + help="Comma-separated phase numbers to run (e.g. '1,2,9' or '1-5,9')", + ) + + return parser + + @classmethod + def fetch_endpoint_list(cls) -> List[Dict]: + """ + Fetch list of available endpoints from Land Registry API. 
+ + Returns: + List of endpoint dictionaries + """ + return FileDownloader().fetch_endpoint_list() + + @staticmethod + def get_la_name_from_url(url: str) -> str: + """ + Extract Local Authority name from endpoint URL. + + Args: + url: Endpoint URL + + Returns: + Formatted LA name + """ + return FileDownloader.get_la_name_from_url(url) + + @classmethod + def list_available_las(cls): + """List all available Local Authorities to console.""" + endpoints = cls.fetch_endpoint_list() + + print(f"\n{'='*60}") + print("Available Local Authorities") + print(f"{'='*60}\n") + + for i, ep in enumerate(endpoints, 1): + name = ep.get("local_authority", "Unknown") + print(f" {i:3d}. {name}") + + print(f"\n{'='*60}") + print(f"Total: {len(endpoints)} Local Authorities") + print(f"{'='*60}\n") + + return endpoints + + @classmethod + def find_matching_la(cls, search_term: str) -> tuple: + """ + Find Local Authority matching search term. + + Args: + search_term: Partial LA name to search for + + Returns: + Tuple of (matching_endpoint, la_name) or (None, None) if no match/multiple matches + """ + endpoints = cls.fetch_endpoint_list() + matching = [ + ep + for ep in endpoints + if search_term.lower() in ep.get("local_authority", "").lower() + ] + + if not matching: + print(f"Error: No Local Authority matching '{search_term}'") + print("Use --list to see available options") + return None, None + + if len(matching) > 1: + print(f"Multiple matches for '{search_term}':") + for ep in matching: + print(f" - {ep.get('local_authority', 'Unknown')}") + print("Please be more specific") + return None, None + + endpoint = matching[0] + la_name = endpoint.get("local_authority", "Unknown") + + return endpoint, la_name diff --git a/local_testing/file_downloader.py b/local_testing/file_downloader.py new file mode 100644 index 000000000..d3bcb4eaf --- /dev/null +++ b/local_testing/file_downloader.py @@ -0,0 +1,201 @@ +""" +File downloader for title-boundary GML files. 
"""
File downloader for title-boundary GML files.

Handles fetching endpoint lists from GitHub config repository
and downloading ZIP files with progress tracking.
"""

import csv
import urllib.request
from pathlib import Path
from typing import List, Optional

try:
    import requests

    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False


class FileDownloader:
    """Handles downloading title-boundary files from endpoint CSV."""

    ENDPOINT_CSV_URL = "https://raw.githubusercontent.com/digital-land/config/main/collection/title-boundary/endpoint.csv"

    def __init__(self, endpoint_csv_url: Optional[str] = None):
        """Initialize downloader with optional custom endpoint CSV URL."""
        self.endpoint_csv_url = endpoint_csv_url or self.ENDPOINT_CSV_URL

    def fetch_endpoint_list(self) -> List[dict]:
        """Fetch list of available title boundary datasets from GitHub CSV.

        Returns a list of dicts with keys: endpoint, url, local_authority,
        entry_date.  Rows without an endpoint-url are skipped.
        """
        print(f"  Fetching endpoint list from {self.endpoint_csv_url}...")

        req = urllib.request.Request(
            self.endpoint_csv_url,
            headers={
                # Some hosts reject requests without a browser-like UA.
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
            },
        )

        with urllib.request.urlopen(req) as response:
            content = response.read().decode("utf-8")
            reader = csv.DictReader(content.splitlines())

            endpoints = []
            for row in reader:
                url = row.get("endpoint-url", "").strip()
                if url:
                    endpoints.append(
                        {
                            "endpoint": row.get("endpoint", ""),
                            "url": url,
                            "local_authority": self.get_la_name_from_url(url),
                            "entry_date": row.get("entry-date", ""),
                        }
                    )

        print(f"  Found {len(endpoints)} endpoints")
        return endpoints

    @staticmethod
    def get_la_name_from_url(url: str) -> str:
        """Extract Local Authority name from download URL.

        e.g. ".../download/Buckinghamshire_Council.zip" -> "Buckinghamshire".
        """
        # URL format: .../download/Buckinghamshire_Council.zip
        parts = url.split("/")
        if not parts:
            return "Unknown"

        filename = parts[-1].replace(".zip", "").replace("_", " ")

        # Strip common suffixes, longest first: every other suffix also ends
        # in " Council", so checking " Council" first would leave e.g.
        # "Liverpool City" behind instead of "Liverpool".
        for suffix in (
            " Metropolitan Borough Council",
            " Borough Council",
            " City Council",
            " District Council",
            " County Council",
            " Council",
        ):
            if filename.endswith(suffix):
                filename = filename[: -len(suffix)]
                break

        # Remove prefixes
        for prefix in (
            "Borough of ",
            "City of ",
            "County of ",
            "Royal Borough of ",
            "London Borough of ",
        ):
            if filename.startswith(prefix):
                filename = filename[len(prefix):]
                break

        return filename.strip()

    def download_file(
        self, url: str, output_path: Path, chunk_size: int = 8192
    ) -> Path:
        """
        Download file from URL to output path with progress tracking.

        Args:
            url: URL to download from
            output_path: Path where file should be saved
            chunk_size: Size of download chunks in bytes

        Returns:
            Path to downloaded file
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)

        print(f"  Downloading from {url}")
        print(f"  Output: {output_path}")

        # Use requests library if available (better redirect/cookie handling)
        if HAS_REQUESTS:
            return self._download_with_requests(url, output_path, chunk_size)
        else:
            return self._download_with_urllib(url, output_path, chunk_size)

    def _download_with_requests(
        self, url: str, output_path: Path, chunk_size: int
    ) -> Path:
        """Download using requests library (handles redirects better)."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-GB,en;q=0.9",
        }

        session = requests.Session()
        session.headers.update(headers)

        response = session.get(url, stream=True, allow_redirects=True, timeout=30)
        response.raise_for_status()

        total_size = int(response.headers.get("content-length", 0))
        downloaded = 0

        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)

                    # content-length may be absent; only then skip progress.
                    if total_size > 0:
                        progress = (downloaded / total_size) * 100
                        mb_downloaded = downloaded / (1024 * 1024)
                        mb_total = total_size / (1024 * 1024)
                        print(
                            f"\r  Progress: {progress:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)",
                            end="",
                            flush=True,
                        )

        print()  # New line after progress
        print(f"  ✓ Downloaded {downloaded:,} bytes")
        return output_path

    def _download_with_urllib(
        self, url: str, output_path: Path, chunk_size: int
    ) -> Path:
        """Download using urllib (fallback)."""

        # Add comprehensive browser headers to mimic real browser
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-GB,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
        }

        req = urllib.request.Request(url, headers=headers)

        with urllib.request.urlopen(req) as response:
            total_size = int(response.headers.get("content-length", 0))
            downloaded = 0

            with open(output_path, "wb") as f:
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)

                    if total_size > 0:
                        progress = (downloaded / total_size) * 100
                        print(
                            f"\r  Progress: {progress:.1f}% ({downloaded:,}/{total_size:,} bytes)",
                            end="",
                        )

        print()  # New line after progress
        size_mb = output_path.stat().st_size / (1024 * 1024)
        print(f"  Downloaded: {size_mb:.1f} MB")

        return output_path
+ +Provides multiple conversion strategies: +- Regex-based CSV conversion +- Polars-based Parquet conversion +- DuckDB-based conversion (fastest, with spatial transforms) +""" + +import csv +import re +from pathlib import Path +from typing import Optional + + +class GMLConverter: + """Converts GML files to CSV/Parquet with multiple strategies.""" + + @staticmethod + def extract_polygon_wkt(geometry_text: str) -> str: + """ + Extract polygon coordinates and convert to WKT format. + + Handles both exterior rings and interior rings (holes). + + Args: + geometry_text: GML geometry element text + + Returns: + WKT polygon string, or empty string if no valid geometry + """ + exterior_match = re.search( + r".*?([^<]+).*?", + geometry_text, + re.DOTALL, + ) + + if not exterior_match: + return "" + + exterior_coords_raw = exterior_match.group(1).strip().split() + exterior_coords = [] + for i in range(0, len(exterior_coords_raw), 2): + if i + 1 < len(exterior_coords_raw): + exterior_coords.append( + f"{exterior_coords_raw[i]} {exterior_coords_raw[i+1]}" + ) + + if not exterior_coords: + return "" + + # Extract interior rings (holes) + interior_rings = [] + interior_matches = re.findall( + r".*?([^<]+).*?", + geometry_text, + re.DOTALL, + ) + + for interior_coords_raw in interior_matches: + coords = interior_coords_raw.strip().split() + ring_coords = [] + for i in range(0, len(coords), 2): + if i + 1 < len(coords): + ring_coords.append(f"{coords[i]} {coords[i+1]}") + if ring_coords: + interior_rings.append(ring_coords) + + exterior_wkt = f"({', '.join(exterior_coords)})" + if interior_rings: + interior_wkts = [f"({', '.join(ring)})" for ring in interior_rings] + return f"POLYGON({exterior_wkt}, {', '.join(interior_wkts)})" + return f"POLYGON({exterior_wkt})" + + @staticmethod + def extract_field(text: str, field_name: str) -> str: + """ + Extract a field value from GML text. 
+ + Args: + text: GML text to search + field_name: Field name to extract + + Returns: + Field value, or empty string if not found + """ + pattern = f"([^<]+)" + match = re.search(pattern, text) + return match.group(1) if match else "" + + def convert_to_csv( + self, gml_path: Path, csv_path: Path, limit: Optional[int] = None + ) -> int: + """ + Convert GML file to CSV format using regex parsing. + + This is the baseline method - slower but doesn't require external dependencies. + + Args: + gml_path: Path to input GML file + csv_path: Path to output CSV file + limit: Optional limit on number of records to convert + + Returns: + Number of records converted + """ + print(f" Converting GML to CSV...") + print(f" Input: {gml_path}") + print(f" Output: {csv_path}") + + size_mb = gml_path.stat().st_size / (1024 * 1024) + print(f" GML size: {size_mb:.1f} MB") + + with open(gml_path, "r", encoding="utf-8") as f: + content = f.read() + + # Find all cadastral parcel elements + pattern = r"]*>(.*?)" + matches = re.findall(pattern, content, re.DOTALL) + total_features = len(matches) + print(f" Found {total_features} cadastral parcels") + + if limit: + print(f" Limiting to {limit} records") + + fieldnames = [ + "reference", + "name", + "national-cadastral-reference", + "geometry", + "start-date", + "entry-date", + "end-date", + "prefix", + "organisation", + "notes", + ] + + csv_path.parent.mkdir(parents=True, exist_ok=True) + count = 0 + + with open(csv_path, "w", newline="", encoding="utf-8") as csvfile: + writer = csv.DictWriter( + csvfile, fieldnames=fieldnames, extrasaction="ignore" + ) + writer.writeheader() + + for match in matches: + feature = {} + + inspire_id = self.extract_field(match, "INSPIREID") + if inspire_id: + feature["reference"] = inspire_id + feature["name"] = inspire_id + + ncr = self.extract_field(match, "NATIONALCADASTRALREFERENCE") + if ncr: + feature["national-cadastral-reference"] = ncr + + valid_from = self.extract_field(match, "VALIDFROM") + if 
valid_from: + feature["start-date"] = ( + valid_from.split("T")[0] if "T" in valid_from else valid_from + ) + + begin_lifespan = self.extract_field(match, "BEGINLIFESPANVERSION") + if begin_lifespan: + feature["entry-date"] = ( + begin_lifespan.split("T")[0] + if "T" in begin_lifespan + else begin_lifespan + ) + + geometry_match = re.search( + r"(.*?)", match, re.DOTALL + ) + if geometry_match: + wkt = self.extract_polygon_wkt(geometry_match.group(1)) + if wkt: + feature["geometry"] = wkt + + if "reference" in feature: + feature["prefix"] = "title-boundary" + feature["organisation"] = "government-organisation:D2" + writer.writerow(feature) + count += 1 + + if count % 5000 == 0: + print(f" Converted {count}/{total_features} features...") + + if limit and count >= limit: + break + + print(f" Converted {count} records to CSV") + return count + + def convert_to_parquet( + self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None + ) -> int: + """ + Convert GML file to Parquet format using regex parsing + Polars. + + Parquet is faster to read than CSV and preserves data types. + Falls back to CSV if Polars is not installed. + + Args: + gml_path: Path to input GML file + parquet_path: Path to output Parquet file + limit: Optional limit on number of records to convert + + Returns: + Number of records converted + """ + try: + import polars as pl + except ImportError: + print(" Polars not installed. 
Install with: pip install polars") + print(" Falling back to CSV...") + csv_path = parquet_path.with_suffix(".csv") + return self.convert_to_csv(gml_path, csv_path, limit) + + print(f" Converting GML to Parquet...") + print(f" Input: {gml_path}") + print(f" Output: {parquet_path}") + + size_mb = gml_path.stat().st_size / (1024 * 1024) + print(f" GML size: {size_mb:.1f} MB") + + with open(gml_path, "r", encoding="utf-8") as f: + content = f.read() + + # Find all cadastral parcel elements + pattern = r"]*>(.*?)" + matches = re.findall(pattern, content, re.DOTALL) + total_features = len(matches) + print(f" Found {total_features} cadastral parcels") + + if limit: + print(f" Limiting to {limit} records") + matches = matches[:limit] + + # Build list of records + records = [] + for match in matches: + feature = {} + + inspire_id = self.extract_field(match, "INSPIREID") + if inspire_id: + feature["reference"] = inspire_id + feature["name"] = inspire_id + + ncr = self.extract_field(match, "NATIONALCADASTRALREFERENCE") + if ncr: + feature["national-cadastral-reference"] = ncr + + valid_from = self.extract_field(match, "VALIDFROM") + if valid_from: + feature["start-date"] = ( + valid_from.split("T")[0] if "T" in valid_from else valid_from + ) + + begin_lifespan = self.extract_field(match, "BEGINLIFESPANVERSION") + if begin_lifespan: + feature["entry-date"] = ( + begin_lifespan.split("T")[0] + if "T" in begin_lifespan + else begin_lifespan + ) + + geometry_match = re.search( + r"(.*?)", match, re.DOTALL + ) + if geometry_match: + wkt = self.extract_polygon_wkt(geometry_match.group(1)) + if wkt: + feature["geometry"] = wkt + + if "reference" in feature: + feature["prefix"] = "title-boundary" + feature["organisation"] = "government-organisation:D2" + feature["end-date"] = None + feature["notes"] = None + records.append(feature) + + # Create DataFrame and write to Parquet + parquet_path.parent.mkdir(parents=True, exist_ok=True) + + df = pl.DataFrame(records) + 
df.write_parquet(parquet_path, compression="snappy") + + count = len(records) + print(f" Converted {count} records to Parquet") + return count + + def convert_to_parquet_duckdb( + self, gml_path: Path, parquet_path: Path, limit: Optional[int] = None + ) -> int: + """ + Convert GML file to Parquet format using DuckDB with spatial extension. + + This is the fastest method - DuckDB reads GML directly and writes Parquet. + Falls back to Polars-based converter if DuckDB is not available. + + Args: + gml_path: Path to input GML file + parquet_path: Path to output Parquet file + limit: Optional limit on number of records to convert + + Returns: + Number of records converted + """ + try: + import duckdb + except ImportError: + print(" DuckDB not installed. Install with: pip install duckdb") + print(" Falling back to Polars-based converter...") + return self.convert_to_parquet(gml_path, parquet_path, limit) + + print(f" Converting GML to Parquet using DuckDB...") + print(f" Input: {gml_path}") + print(f" Output: {parquet_path}") + + size_mb = gml_path.stat().st_size / (1024 * 1024) + print(f" GML size: {size_mb:.1f} MB") + + parquet_path.parent.mkdir(parents=True, exist_ok=True) + + try: + con = duckdb.connect() + try: + con.execute("INSTALL spatial; LOAD spatial;") + print(" Loaded DuckDB spatial extension") + except Exception as ext_err: + print(f" Failed to load spatial extension: {ext_err}") + print(" Falling back to Polars-based converter...") + con.close() + return self.convert_to_parquet(gml_path, parquet_path, limit) + + print(" Reading GML file...") + limit_clause = f"LIMIT {limit}" if limit else "" + + query = f""" + SELECT + INSPIREID as reference, + INSPIREID as name, + NATIONALCADASTRALREFERENCE as "national-cadastral-reference", + ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry, + CASE + WHEN VALIDFROM IS NOT NULL + THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d') + ELSE NULL + END as "start-date", + CASE + WHEN BEGINLIFESPANVERSION IS 
NOT NULL + THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d') + ELSE NULL + END as "entry-date", + NULL as "end-date", + 'title-boundary' as prefix, + 'government-organisation:D2' as organisation, + NULL as notes + FROM ST_Read('{gml_path}') + WHERE INSPIREID IS NOT NULL + {limit_clause} + """ + + count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')" + total_count = con.execute(count_query).fetchone()[0] + print(f" Found {total_count:,} cadastral parcels") + + if limit: + print(f" Limiting to {limit} records") + + # Export directly to Parquet (much faster than CSV) + print(" Transforming and writing to Parquet...") + con.execute( + f"COPY ({query}) TO '{parquet_path}' (FORMAT PARQUET, COMPRESSION 'snappy')" + ) + + # Count output rows + result_count = con.execute( + f"SELECT COUNT(*) FROM read_parquet('{parquet_path}')" + ).fetchone()[0] + + con.close() + + print(f" Converted {result_count:,} records to Parquet") + return result_count + + except Exception as e: + print(f" DuckDB conversion failed: {e}") + print(" Falling back to Polars-based converter...") + return self.convert_to_parquet(gml_path, parquet_path, limit) + + def convert_to_csv_duckdb( + self, gml_path: Path, csv_path: Path, limit: Optional[int] = None + ) -> int: + """ + Convert GML file to CSV format using DuckDB with spatial extension. + + This is significantly faster than regex parsing and properly handles: + - Coordinate transformations (OSGB EPSG:27700 to WGS84 EPSG:4326) + - Complex geometries (multi-polygons, holes) + - Large files with streaming + + Note: For even better performance, use convert_to_parquet_duckdb() instead. + Falls back to regex-based converter if DuckDB is not available. + + Args: + gml_path: Path to input GML file + csv_path: Path to output CSV file + limit: Optional limit on number of records to convert + + Returns: + Number of records converted + """ + try: + import duckdb + except ImportError: + print(" DuckDB not installed. 
Install with: pip install duckdb") + print(" Falling back to regex-based converter...") + return self.convert_to_csv(gml_path, csv_path, limit) + + print(f" Converting GML to CSV using DuckDB...") + print(f" Input: {gml_path}") + print(f" Output: {csv_path}") + + size_mb = gml_path.stat().st_size / (1024 * 1024) + print(f" GML size: {size_mb:.1f} MB") + + csv_path.parent.mkdir(parents=True, exist_ok=True) + + try: + # Create DuckDB connection and load spatial extension + con = duckdb.connect() + try: + con.execute("INSTALL spatial; LOAD spatial;") + print(" Loaded DuckDB spatial extension") + except Exception as ext_err: + print(f" Failed to load spatial extension: {ext_err}") + print(" This may be a network issue. Try running:") + print( + " python -c \"import duckdb; duckdb.connect().execute('INSTALL spatial')\"" + ) + print(" Falling back to regex-based converter...") + con.close() + return self.convert_to_csv(gml_path, csv_path, limit) + + # Read GML file using ST_Read (GDAL-based) + print(" Reading GML file...") + + limit_clause = f"LIMIT {limit}" if limit else "" + + query = f""" + SELECT + INSPIREID as reference, + INSPIREID as name, + NATIONALCADASTRALREFERENCE as "national-cadastral-reference", + ST_AsText(ST_Transform(geom, 'EPSG:27700', 'EPSG:4326')) as geometry, + CASE + WHEN VALIDFROM IS NOT NULL + THEN strftime(CAST(VALIDFROM AS DATE), '%Y-%m-%d') + ELSE NULL + END as "start-date", + CASE + WHEN BEGINLIFESPANVERSION IS NOT NULL + THEN strftime(CAST(BEGINLIFESPANVERSION AS DATE), '%Y-%m-%d') + ELSE NULL + END as "entry-date", + NULL as "end-date", + 'title-boundary' as prefix, + 'government-organisation:D2' as organisation, + NULL as notes + FROM ST_Read('{gml_path}') + WHERE INSPIREID IS NOT NULL + {limit_clause} + """ + + # Execute and get count first + count_query = f"SELECT COUNT(*) FROM ST_Read('{gml_path}')" + total_count = con.execute(count_query).fetchone()[0] + print(f" Found {total_count:,} cadastral parcels") + + if limit: + print(f" 
Limiting to {limit} records") + + # Export directly to CSV + print(" Transforming and writing to CSV...") + con.execute(f"COPY ({query}) TO '{csv_path}' (HEADER, DELIMITER ',')") + + # Count output rows + result_count = con.execute( + f"SELECT COUNT(*) FROM read_csv('{csv_path}')" + ).fetchone()[0] + + con.close() + + print(f" Converted {result_count:,} records to CSV") + return result_count + + except Exception as e: + print(f" DuckDB conversion failed: {e}") + print(" Falling back to regex-based converter...") + return self.convert_to_csv(gml_path, csv_path, limit) diff --git a/local_testing/gml_extractor.py b/local_testing/gml_extractor.py new file mode 100644 index 000000000..767ed8b19 --- /dev/null +++ b/local_testing/gml_extractor.py @@ -0,0 +1,50 @@ +""" +GML extractor for title-boundary datasets. + +Handles extraction of GML files from ZIP archives. +""" + +import zipfile +from pathlib import Path + + +class GMLExtractor: + """Extracts GML files from ZIP archives.""" + + @staticmethod + def extract_gml_from_zip(zip_path: Path, output_dir: Path) -> Path: + """ + Extract GML file from ZIP archive. 
+ + Args: + zip_path: Path to ZIP file + output_dir: Directory to extract GML file to + + Returns: + Path to extracted GML file + + Raises: + ValueError: If no GML file found in archive + """ + output_dir.mkdir(parents=True, exist_ok=True) + + print(f" Extracting GML from {zip_path}") + + with zipfile.ZipFile(zip_path, "r") as zip_ref: + # Find GML file in archive + gml_files = [f for f in zip_ref.namelist() if f.lower().endswith(".gml")] + + if not gml_files: + raise ValueError(f"No GML file found in {zip_path}") + + gml_filename = gml_files[0] + print(f" Found: {gml_filename}") + + # Extract to output directory + zip_ref.extract(gml_filename, output_dir) + + gml_path = output_dir / gml_filename + size_mb = gml_path.stat().st_size / (1024 * 1024) + print(f" Extracted: {gml_path} ({size_mb:.1f} MB)") + + return gml_path diff --git a/local_testing/main.py b/local_testing/main.py new file mode 100644 index 000000000..8c99c20e7 --- /dev/null +++ b/local_testing/main.py @@ -0,0 +1,370 @@ +#!/usr/bin/env python3 +""" +Title Boundary Pipeline - Download, Convert, and Transform GML data from Land Registry + +Orchestration script that coordinates multiple specialized classes. 
+""" + +import sys +import time +from pathlib import Path +from datetime import datetime + +from cli import CLI +from file_downloader import FileDownloader +from gml_extractor import GMLExtractor +from gml_converter import GMLConverter +from pipeline_config import PipelineConfig +from pipeline_runner import PipelineRunner +from pipeline_report import PipelineReport + + +# ============================================================================= +# Constants +# ============================================================================= + +SCRIPT_DIR = Path(__file__).parent.resolve() +DATASET = "title-boundary" + + +# ============================================================================= +# Helper Functions +# ============================================================================= + + +def parse_phase_selection(phases_str: str) -> set: + """ + Parse phase selection string into set of phase numbers. + + Args: + phases_str: Comma-separated phase numbers or ranges (e.g., "1,2,9" or "1-5,9") + + Returns: + Set of selected phase numbers, or None if invalid + """ + phases = set() + try: + for part in phases_str.split(","): + part = part.strip() + if "-" in part: + # Range: "1-5" + start, end = part.split("-") + phases.update(range(int(start), int(end) + 1)) + else: + # Single phase: "9" + phases.add(int(part)) + + # Validate phase numbers (1-26) + if any(p < 1 or p > 26 for p in phases): + return None + + return phases + except (ValueError, AttributeError): + return None + + +# ============================================================================= +# Main Entry Point +# ============================================================================= + + +def main(): + """Main entry point for title-boundary pipeline.""" + + # Parse arguments using CLI class + parser = CLI.create_parser() + args = parser.parse_args() + + # Setup directories + raw_dir = SCRIPT_DIR / "raw" + extracted_dir = SCRIPT_DIR / "extracted" + converted_dir = SCRIPT_DIR / 
"converted" + output_dir = SCRIPT_DIR / "output" + pipeline_dir = SCRIPT_DIR / "pipeline" + specification_dir = SCRIPT_DIR.parent / "specification" + cache_dir = SCRIPT_DIR / "cache" + reports_dir = SCRIPT_DIR / "reports" + + for directory in [ + raw_dir, + extracted_dir, + converted_dir, + output_dir, + pipeline_dir, + cache_dir, + reports_dir, + ]: + directory.mkdir(parents=True, exist_ok=True) + + # List mode - use CLI class + if args.list or not args.la: + CLI.list_available_las() + if not args.la: + print("Use --la 'Name' to process a specific Local Authority") + return 0 + + # Find matching LA - use CLI class + endpoint, la_name = CLI.find_matching_la(args.la) + if not endpoint: + return 1 + + # Initialize + la_slug = la_name.lower().replace(" ", "_").replace(",", "") + report = PipelineReport() + report.local_authority = la_name + report.dataset = DATASET + + # Print header + print(f"\n{'='*60}") + print("Title Boundary Pipeline") + print(f"{'='*60}") + print(f"Local Authority: {la_name}") + print(f"Endpoint: {endpoint['url']}") + if args.limit: + print(f"Limit: {args.limit:,} records") + print(f"{'='*60}\n") + + overall_start = time.time() + + # ========================================================================= + # Step 1: Download - use FileDownloader class + # ========================================================================= + print("Step 1: Download") + print("-" * 40) + + step_download = report.add_step("Download") + zip_path = raw_dir / f"{la_slug}.zip" + + if args.skip_download and zip_path.exists(): + print(f" Using existing: {zip_path}") + step_download.mark_complete(success=True) + else: + downloader = FileDownloader() + success = downloader.download_file(endpoint["url"], zip_path) + step_download.mark_complete(success=success) + if not success: + print(" Download failed") + return 1 + + if zip_path.exists(): + report.zip_size_mb = zip_path.stat().st_size / (1024 * 1024) + + # 
========================================================================= + # Step 2: Extract - use GMLExtractor class + # ========================================================================= + print("\nStep 2: Extract") + print("-" * 40) + + step_extract = report.add_step("Extract") + extract_subdir = extracted_dir / la_slug + + try: + gml_path = GMLExtractor.extract_gml_from_zip(zip_path, extract_subdir) + step_extract.mark_complete(success=True) + + if gml_path.exists(): + report.gml_size_mb = gml_path.stat().st_size / (1024 * 1024) + except Exception as e: + print(f" Extraction failed: {e}") + step_extract.mark_complete(success=False) + return 1 + + # ========================================================================= + # Step 3: Convert - use GMLConverter class + # ========================================================================= + output_format = "Parquet" if args.use_parquet else "CSV" + print(f"\nStep 3: Convert GML to {output_format}") + print("-" * 40) + + step_convert = report.add_step("Convert") + converter = GMLConverter() + + # Choose conversion method based on arguments + if args.use_duckdb and args.use_parquet: + method = "DuckDB+Parquet" + output_path = converted_dir / f"{la_slug}.parquet" + record_count = converter.convert_to_parquet_duckdb( + gml_path, output_path, limit=args.limit + ) + elif args.use_duckdb: + method = "DuckDB+CSV" + output_path = converted_dir / f"{la_slug}.csv" + record_count = converter.convert_to_csv_duckdb( + gml_path, output_path, limit=args.limit + ) + elif args.use_parquet: + method = "Polars+Parquet" + output_path = converted_dir / f"{la_slug}.parquet" + record_count = converter.convert_to_parquet( + gml_path, output_path, limit=args.limit + ) + else: + method = "Polars+CSV" + output_path = converted_dir / f"{la_slug}.csv" + record_count = converter.convert_to_csv(gml_path, output_path, limit=args.limit) + + step_convert.mark_complete( + success=record_count > 0, record_count=record_count, 
method=method + ) + + if record_count == 0: + print(" Conversion produced no records") + return 1 + + report.input_records = record_count + + # ========================================================================= + # Step 4: Transform - use PipelineConfig and PipelineRunner classes + # ========================================================================= + print("\nStep 4: Transform through Pipeline") + print("-" * 40) + + step_transform = report.add_step("Transform") + + # Ensure configuration exists using PipelineConfig class + PipelineConfig.ensure_pipeline_config(pipeline_dir) + + if not specification_dir.exists(): + print(f" Error: Specification directory not found: {specification_dir}") + print(f" Please clone specification to: {specification_dir}") + step_transform.mark_complete(success=False) + return 1 + + # Parse phase selection if provided + selected_phases = None + if args.phases: + selected_phases = parse_phase_selection(args.phases) + if selected_phases: + print(f" Running selected phases: {sorted(selected_phases)}") + report.selected_phases = selected_phases # Store in report for filtering + else: + print(f" Invalid phase selection: {args.phases}") + step_transform.mark_complete(success=False) + return 1 + + # Run pipeline using PipelineRunner class + runner = PipelineRunner(dataset=DATASET) + results = runner.run_full_pipeline( + input_csv=output_path, + output_dir=output_dir, + specification_dir=specification_dir, + pipeline_dir=pipeline_dir, + cache_dir=cache_dir, + la_name=la_name, + report=report, + selected_phases=selected_phases, + ) + + step_transform.mark_complete( + success=True, + harmonised_records=results["harmonised"], + fact_records=results["facts"], + transform_time=results.get("transform_time", 0), + ) + + # Run Polars pipeline for comparison if requested + if args.compare: + print("\n Running Polars pipeline for comparison...") + from polars_phases import run_polars_pipeline + + # Define required parameters + 
field_datatype_map = {"geometry": "text"} # Simplified for now + intermediate_fieldnames = ["entity", "name", "geometry", "organisation"] + factor_fieldnames = ["entity", "fact"] + + polars_harmonised = output_dir / f"{la_name}_polars_harmonised.csv" + polars_facts = output_dir / f"{la_name}_polars_facts.csv" + + polars_start = time.time() + polars_metrics, polars_harm_count, polars_fact_count = run_polars_pipeline( + input_csv=output_path, + harmonised_csv=polars_harmonised, + facts_csv=polars_facts, + field_datatype_map=field_datatype_map, + intermediate_fieldnames=intermediate_fieldnames, + factor_fieldnames=factor_fieldnames, + dataset=DATASET, + selected_phases=selected_phases, # Pass phase selection to Polars + ) + polars_end = time.time() + + # Store Polars metrics in report + report.polars_phases = [] + for metric in polars_metrics: + from pipeline_report import PhaseMetrics + + phase_metric = PhaseMetrics( + name=metric.name, + phase_number=metric.phase_number, + start_time=0, + end_time=0, + duration_seconds=metric.duration_seconds, + input_count=metric.input_count, + output_count=metric.output_count, + ) + report.polars_phases.append(phase_metric) + + report.polars_harmonised_records = polars_harm_count + report.polars_fact_records = polars_fact_count + report.polars_transform_seconds = polars_end - polars_start + + speedup = ( + results.get("transform_time", 0) / report.polars_transform_seconds + if report.polars_transform_seconds > 0 + else 0 + ) + print(f" Polars transform time: {report.polars_transform_seconds:.3f}s") + print(f" Speedup: {speedup:.1f}x faster") + + # ========================================================================= + # Step 5: Generate Report - use PipelineReport class + # ========================================================================= + overall_end = time.time() + report.total_duration_seconds = overall_end - overall_start + report.calculate_totals() + + print("\nStep 5: Generate Performance Report") + print("-" * 
40) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + json_path = reports_dir / f"{la_slug}_{timestamp}_performance.json" + text_path = reports_dir / f"{la_slug}_{timestamp}_performance.txt" + + report.save_json(json_path) + report.save_text(text_path) + + print(f" JSON report: {json_path}") + print(f" Text report: {text_path}") + + # ========================================================================= + # Summary + # ========================================================================= + print(f"\n{'='*60}") + print("PIPELINE COMPLETE") + print(f"{'='*60}") + print(f"Local Authority: {la_name}") + print(f"Dataset: {DATASET}") + print(f"Total Duration: {report.total_duration_seconds:.2f}s") + print(f"Input Records: {report.input_records:,}") + print(f"Harmonised Records: {report.harmonised_records:,}") + print(f"Fact Records: {report.fact_records:,}") + + if report.steps: + print("\nStep Summary:") + for name, step in report.steps.items(): + status = "✓" if step.success else "✗" + print(f" {status} {name:<20} {step.duration_seconds:8.3f}s") + + if report.phases: + total_phase_time = sum(p.duration_seconds for p in report.phases) + print( + f"\nTransform Phases: {len(report.phases)} phases, {total_phase_time:.3f}s total" + ) + + print(f"{'='*60}\n") + + return 0 + + +if __name__ == "__main__": + sys.exit(main() or 0) diff --git a/local_testing/pipeline/column.csv b/local_testing/pipeline/column.csv new file mode 100644 index 000000000..ee2a9062e --- /dev/null +++ b/local_testing/pipeline/column.csv @@ -0,0 +1,11 @@ +dataset,resource,column,field +title-boundary,,reference,reference +title-boundary,,name,name +title-boundary,,geometry,geometry +title-boundary,,start-date,start-date +title-boundary,,entry-date,entry-date +title-boundary,,end-date,end-date +title-boundary,,prefix,prefix +title-boundary,,organisation,organisation +title-boundary,,notes,notes +title-boundary,,national-cadastral-reference,notes diff --git 
a/local_testing/pipeline/combine.csv b/local_testing/pipeline/combine.csv new file mode 100644 index 000000000..cae5fefa4 --- /dev/null +++ b/local_testing/pipeline/combine.csv @@ -0,0 +1 @@ +dataset,resource,field,fields,separator diff --git a/local_testing/pipeline/concat.csv b/local_testing/pipeline/concat.csv new file mode 100644 index 000000000..cae5fefa4 --- /dev/null +++ b/local_testing/pipeline/concat.csv @@ -0,0 +1 @@ +dataset,resource,field,fields,separator diff --git a/local_testing/pipeline/convert.csv b/local_testing/pipeline/convert.csv new file mode 100644 index 000000000..926bf51e3 --- /dev/null +++ b/local_testing/pipeline/convert.csv @@ -0,0 +1 @@ +dataset,resource,field,value,replacement diff --git a/local_testing/pipeline/default.csv b/local_testing/pipeline/default.csv new file mode 100644 index 000000000..8f30d573f --- /dev/null +++ b/local_testing/pipeline/default.csv @@ -0,0 +1,2 @@ +dataset,resource,field,default-field,entry-date +title-boundary,,,entry-date, diff --git a/local_testing/pipeline/filter.csv b/local_testing/pipeline/filter.csv new file mode 100644 index 000000000..a98026996 --- /dev/null +++ b/local_testing/pipeline/filter.csv @@ -0,0 +1 @@ +dataset,resource,field,pattern diff --git a/local_testing/pipeline/lookup.csv b/local_testing/pipeline/lookup.csv new file mode 100644 index 000000000..fee3f4f07 --- /dev/null +++ b/local_testing/pipeline/lookup.csv @@ -0,0 +1,11 @@ +prefix,resource,organisation,reference,entity +title-boundary,,,33205373,12000000001 +title-boundary,,,60898175,12000000002 +title-boundary,,,33209075,12000000003 +title-boundary,,,55955680,12000000004 +title-boundary,,,37316451,12000000005 +title-boundary,,,26291037,12000000006 +title-boundary,,,30556652,12000000007 +title-boundary,,,42046003,12000000008 +title-boundary,,,32896399,12000000009 +title-boundary,,,42173303,12000000010 diff --git a/local_testing/pipeline/migrate.csv b/local_testing/pipeline/migrate.csv new file mode 100644 index 
000000000..728e7bbc3 --- /dev/null +++ b/local_testing/pipeline/migrate.csv @@ -0,0 +1 @@ +dataset,old-field,new-field diff --git a/local_testing/pipeline/patch.csv b/local_testing/pipeline/patch.csv new file mode 100644 index 000000000..478c396a4 --- /dev/null +++ b/local_testing/pipeline/patch.csv @@ -0,0 +1 @@ +dataset,resource,field,pattern,value diff --git a/local_testing/pipeline/redirect.csv b/local_testing/pipeline/redirect.csv new file mode 100644 index 000000000..d3d9f670b --- /dev/null +++ b/local_testing/pipeline/redirect.csv @@ -0,0 +1 @@ +entity,status,redirect-entity diff --git a/local_testing/pipeline/skip.csv b/local_testing/pipeline/skip.csv new file mode 100644 index 000000000..d5f3eaff9 --- /dev/null +++ b/local_testing/pipeline/skip.csv @@ -0,0 +1 @@ +dataset,resource,pattern diff --git a/local_testing/pipeline_config.py b/local_testing/pipeline_config.py new file mode 100644 index 000000000..35aaf338a --- /dev/null +++ b/local_testing/pipeline_config.py @@ -0,0 +1,93 @@ +""" +Pipeline configuration management for title-boundary dataset. + +Handles creation and management of pipeline configuration CSV files +and downloading of required resources like organisation.csv. +""" + +import urllib.request +from pathlib import Path + + +class PipelineConfig: + """Manages pipeline configuration files and resources.""" + + @staticmethod + def ensure_pipeline_config(pipeline_dir: Path): + """ + Ensure all required pipeline configuration CSV files exist. 
+
+        Creates default configuration files for:
+        - column mapping
+        - default values
+        - patches, concatenations, combinations
+        - filters, lookups, migrations, redirects
+
+        Args:
+            pipeline_dir: Directory where pipeline config files should be created
+        """
+        pipeline_dir.mkdir(parents=True, exist_ok=True)
+
+        configs = {
+            "column.csv": """dataset,resource,column,field
+title-boundary,,reference,reference
+title-boundary,,name,name
+title-boundary,,geometry,geometry
+title-boundary,,start-date,start-date
+title-boundary,,entry-date,entry-date
+title-boundary,,end-date,end-date
+title-boundary,,prefix,prefix
+title-boundary,,organisation,organisation
+title-boundary,,notes,notes
+title-boundary,,national-cadastral-reference,national-cadastral-reference
+""",
+            "default.csv": "dataset,resource,field,default-field,entry-date\n",
+            "patch.csv": "dataset,resource,field,pattern,value\n",
+            "concat.csv": "dataset,resource,field,fields,separator\n",
+            "combine.csv": "dataset,resource,field,fields,separator\n",
+            "convert.csv": "dataset,resource,field,value,replacement\n",
+            "filter.csv": "dataset,resource,field,pattern\n",
+            "skip.csv": "dataset,resource,pattern\n",
+            "lookup.csv": "prefix,resource,organisation,reference,entity\n",
+            "migrate.csv": "dataset,old-field,new-field\n",
+            "redirect.csv": "entity,status,redirect-entity\n",
+        }
+
+        for filename, content in configs.items():
+            filepath = pipeline_dir / filename
+            if not filepath.exists():
+                filepath.write_text(content)
+
+    @staticmethod
+    def download_organisation_csv(cache_dir: Path) -> Path:
+        """
+        Download organisation.csv from digital-land repository if not present.
+
+        Falls back to creating a minimal organisation.csv with Land Registry
+        data if download fails.
+ + Args: + cache_dir: Directory where organisation.csv should be cached + + Returns: + Path to organisation.csv file + """ + org_csv = cache_dir / "organisation.csv" + + if not org_csv.exists(): + print(" Downloading organisation.csv...") + url = "https://raw.githubusercontent.com/digital-land/organisation-dataset/main/collection/organisation.csv" + + try: + cache_dir.mkdir(parents=True, exist_ok=True) + urllib.request.urlretrieve(url, org_csv) + print(f" Downloaded organisation.csv ({org_csv.stat().st_size} bytes)") + except Exception as e: + print(f" Warning: Could not download ({e}), creating minimal file") + org_csv.write_text( + "organisation,name,statistical-geography,opendatacommunities-uri\n" + "government-organisation:D2,Land Registry,E92000001," + "http://opendatacommunities.org/id/government-organisation/land-registry\n" + ) + + return org_csv diff --git a/local_testing/pipeline_report.py b/local_testing/pipeline_report.py new file mode 100644 index 000000000..847878a44 --- /dev/null +++ b/local_testing/pipeline_report.py @@ -0,0 +1,497 @@ +""" +Performance reporting and metrics tracking for pipeline runs. + +Provides classes to track timing, resource usage, and comparison +metrics for original vs Polars pipeline implementations. 
"""
Performance reporting and metrics tracking for pipeline runs.

Provides classes to track timing, resource usage, and comparison
metrics for original vs Polars pipeline implementations.
"""

import json
import sys
import time
import platform as plat
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field


@dataclass
class PhaseMetrics:
    """Metrics for a single pipeline phase."""

    name: str
    phase_number: int
    start_time: float = 0.0
    end_time: float = 0.0
    duration_seconds: float = 0.0
    input_count: int = 0
    output_count: int = 0

    def complete(self, output_count: int = 0):
        """Mark phase as complete and calculate duration from start_time."""
        self.end_time = time.time()
        self.duration_seconds = self.end_time - self.start_time
        self.output_count = output_count


@dataclass
class StepMetrics:
    """Metrics for a pipeline step (Download, Extract, Convert, Transform)."""

    name: str
    start_time: float = 0.0
    end_time: float = 0.0
    duration_seconds: float = 0.0
    success: bool = True
    details: Dict[str, Any] = field(default_factory=dict)

    def start(self):
        """Start timing this step."""
        self.start_time = time.time()

    def complete(self, **details):
        """Mark step as complete, leaving the success flag unchanged.

        Delegates to mark_complete so the timing logic lives in one place
        (the two methods previously duplicated it).
        """
        self.mark_complete(success=self.success, **details)

    def mark_complete(self, success: bool = True, **details):
        """Mark step as complete with an explicit success status."""
        self.end_time = time.time()
        self.duration_seconds = self.end_time - self.start_time
        self.success = success
        self.details.update(details)


@dataclass
class PipelineReport:
    """Complete performance report for a pipeline run.

    Collects run metadata, per-step and per-phase timings for the original
    pipeline, optional Polars-pipeline timings for comparison, and renders
    them as JSON (to_dict/save_json) or a human-readable table (save_text).
    """

    # Run metadata
    run_id: str = ""
    timestamp: str = ""
    local_authority: str = ""
    dataset: str = "title-boundary"
    record_limit: Optional[int] = None

    # Input/Output metrics
    input_records: int = 0
    harmonised_records: int = 0
    fact_records: int = 0

    # Polars comparison metrics
    polars_harmonised_records: int = 0
    polars_fact_records: int = 0
    polars_phases: List[PhaseMetrics] = field(default_factory=list)
    polars_transform_seconds: float = 0.0

    # File sizes
    zip_size_mb: float = 0.0
    gml_size_mb: float = 0.0
    csv_size_mb: float = 0.0

    # Step timings
    steps: Dict[str, StepMetrics] = field(default_factory=dict)

    # Phase timings (transformation only)
    phases: List[PhaseMetrics] = field(default_factory=list)

    # Phase selection (if running specific phases)
    selected_phases: Optional[set] = None

    # Total timing
    total_duration_seconds: float = 0.0
    transform_duration_seconds: float = 0.0

    # System info
    python_version: str = ""
    platform: str = ""

    def __post_init__(self):
        """Fill in run metadata.

        Fix: only derive defaults when the caller did not supply explicit
        values — the previous version unconditionally overwrote any run_id
        or timestamp passed to the constructor.
        """
        if not self.python_version:
            self.python_version = sys.version.split()[0]
        if not self.platform:
            self.platform = f"{plat.system()} {plat.release()}"
        if not self.run_id:
            self.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        if not self.timestamp:
            self.timestamp = datetime.now().isoformat()

    def add_step(self, name: str) -> StepMetrics:
        """Add, start timing, and return a new step."""
        step = StepMetrics(name=name)
        step.start()
        self.steps[name] = step
        return step

    def add_phase(self, name: str, phase_number: int) -> PhaseMetrics:
        """Add and return a new phase with its start time set to now."""
        phase = PhaseMetrics(
            name=name, phase_number=phase_number, start_time=time.time()
        )
        self.phases.append(phase)
        return phase

    def calculate_totals(self):
        """Calculate total durations from the recorded steps and phases."""
        self.total_duration_seconds = sum(
            s.duration_seconds for s in self.steps.values()
        )
        self.transform_duration_seconds = sum(p.duration_seconds for p in self.phases)

    def _selected(self, phases: List[PhaseMetrics]) -> List[PhaseMetrics]:
        """Return phases filtered to selected_phases (or all when unset)."""
        if not self.selected_phases:
            return phases
        return [p for p in phases if p.phase_number in self.selected_phases]

    def to_dict(self) -> Dict:
        """Convert to dictionary for JSON serialization."""
        phases_to_output = self._selected(self.phases)
        polars_phases_to_output = self._selected(self.polars_phases)

        # The same speedup figure is reported under "timing" and
        # "comparison"; compute it once (fix: previously duplicated inline).
        speedup = (
            (self.transform_duration_seconds / self.polars_transform_seconds)
            if self.polars_transform_seconds > 0
            else 0
        )

        def phase_rows(phases):
            return [
                {
                    "number": p.phase_number,
                    "name": p.name,
                    "duration_seconds": p.duration_seconds,
                    "output_count": p.output_count,
                }
                for p in phases
            ]

        return {
            "run_id": self.run_id,
            "timestamp": self.timestamp,
            "local_authority": self.local_authority,
            "dataset": self.dataset,
            "record_limit": self.record_limit,
            # sorted() already returns a list; list(sorted(...)) was redundant
            "selected_phases": (
                sorted(self.selected_phases) if self.selected_phases else None
            ),
            "input_records": self.input_records,
            "harmonised_records": self.harmonised_records,
            "fact_records": self.fact_records,
            "file_sizes": {
                "zip_mb": self.zip_size_mb,
                "gml_mb": self.gml_size_mb,
                "csv_mb": self.csv_size_mb,
            },
            "timing": {
                "total_seconds": self.total_duration_seconds,
                "transform_seconds": self.transform_duration_seconds,
                "polars_transform_seconds": self.polars_transform_seconds,
                "speedup_factor": speedup,
                "steps": {
                    name: {"duration_seconds": s.duration_seconds, **s.details}
                    for name, s in self.steps.items()
                },
                "phases": phase_rows(phases_to_output),
                "polars_phases": phase_rows(polars_phases_to_output),
            },
            "comparison": {
                "original_transform_seconds": self.transform_duration_seconds,
                "polars_transform_seconds": self.polars_transform_seconds,
                "speedup_factor": speedup,
                "time_saved_seconds": self.transform_duration_seconds
                - self.polars_transform_seconds,
            },
            "system": {
                "python_version": self.python_version,
                "platform": self.platform,
            },
        }

    def generate_text_report(self) -> str:
        """Generate human-readable text report."""
        lines = []
        lines.append("=" * 100)
        lines.append("TITLE BOUNDARY PIPELINE - PERFORMANCE REPORT")
        lines.append("=" * 100)
        lines.append("")
        lines.append(f"Run ID: {self.run_id}")
        lines.append(f"Timestamp: {self.timestamp}")
        lines.append(f"Local Authority: {self.local_authority}")
        lines.append(f"Dataset: {self.dataset}")
        lines.append(f"Record Limit: {self.record_limit or 'None (all records)'}")
        lines.append("")

        lines.append("-" * 100)
        lines.append("INPUT/OUTPUT SUMMARY")
        lines.append("-" * 100)
        lines.append(f"Input Records: {self.input_records:,}")
        if self.polars_phases:
            lines.append(
                f"Harmonised Records: {self.harmonised_records:,} (Original) / {self.polars_harmonised_records:,} (Polars)"
            )
            lines.append(
                f"Fact Records: {self.fact_records:,} (Original) / {self.polars_fact_records:,} (Polars)"
            )
        else:
            lines.append(f"Harmonised Records: {self.harmonised_records:,}")
            lines.append(f"Fact Records: {self.fact_records:,}")
        lines.append("")

        lines.append("-" * 100)
        lines.append("FILE SIZES")
        lines.append("-" * 100)
        lines.append(f"ZIP File: {self.zip_size_mb:,.2f} MB")
        lines.append(f"GML File: {self.gml_size_mb:,.2f} MB")
        lines.append(f"CSV File: {self.csv_size_mb:,.2f} MB")
        lines.append("")

        lines.append("-" * 100)
        lines.append("STEP TIMING SUMMARY")
        lines.append("-" * 100)
        lines.append(f"{'Step':<20} {'Duration':>12} {'% of Total':>12}")
        lines.append("-" * 44)
        for name, step in self.steps.items():
            pct = (
                (step.duration_seconds / self.total_duration_seconds * 100)
                if self.total_duration_seconds > 0
                else 0
            )
            lines.append(f"{name:<20} {step.duration_seconds:>10.3f}s {pct:>10.1f}%")
        lines.append("-" * 44)
        lines.append(
            f"{'TOTAL':<20} {self.total_duration_seconds:>10.3f}s {100.0:>10.1f}%"
        )
        lines.append("")

        # COMBINED PHASE COMPARISON TABLE (if Polars was run)
        if self.polars_phases:
            lines.append("=" * 100)
            lines.append("PHASE-BY-PHASE COMPARISON: ORIGINAL vs POLARS")
            lines.append("=" * 100)

            # Show phase selection info if applicable
            if self.selected_phases:
                lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
            lines.append("")

            header = (
                f"{'#':<3} {'Phase Name':<26} {'Original':>11} {'Polars':>11} "
                f"{'Speedup':>10} {'Time Saved':>12} {'Orig Out':>10} {'Polars Out':>10}"
            )
            lines.append(header)
            lines.append("-" * 100)

            # Match Polars phases to original phases by name
            polars_by_name = {p.name: p for p in self.polars_phases}
            phases_to_display = self._selected(self.phases)

            total_original = 0.0
            total_polars = 0.0
            total_saved = 0.0

            for phase in phases_to_display:
                polars_phase = polars_by_name.get(phase.name)
                if polars_phase:
                    if polars_phase.duration_seconds > 0:
                        speedup = phase.duration_seconds / polars_phase.duration_seconds
                    else:
                        # both ~0 counts as parity, not infinite speedup
                        speedup = float("inf") if phase.duration_seconds > 0 else 1.0

                    saved = phase.duration_seconds - polars_phase.duration_seconds
                    speedup_str = f"{speedup:.1f}x" if speedup != float("inf") else "∞"

                    phase_line = (
                        f"{phase.phase_number:<3} {phase.name:<26} "
                        f"{phase.duration_seconds:>9.4f}s {polars_phase.duration_seconds:>9.4f}s "
                        f"{speedup_str:>9} {saved:>10.4f}s {phase.output_count:>10,} "
                        f"{polars_phase.output_count:>10,}"
                    )
                    lines.append(phase_line)

                    total_original += phase.duration_seconds
                    total_polars += polars_phase.duration_seconds
                    total_saved += saved
                else:
                    lines.append(
                        f"{phase.phase_number:<3} {phase.name:<26} {phase.duration_seconds:>9.4f}s {'N/A':>11} {'N/A':>9} {'N/A':>12} {phase.output_count:>10,} {'N/A':>10}"
                    )

            lines.append("-" * 100)
            overall_speedup = total_original / total_polars if total_polars > 0 else 0
            lines.append(
                f"{'':3} {'TOTAL TRANSFORM TIME':<26} {total_original:>9.4f}s {total_polars:>9.4f}s {overall_speedup:>8.1f}x {total_saved:>10.4f}s"
            )
            lines.append("")

            # Overall summary
            lines.append("-" * 100)
            lines.append("PERFORMANCE SUMMARY")
            lines.append("-" * 100)
            lines.append(f"Original Pipeline: {total_original:.4f}s")
            lines.append(f"Polars Pipeline: {total_polars:.4f}s")
            lines.append(f"Speedup Factor: {overall_speedup:.1f}x faster")
            # Fix: guard against ZeroDivisionError when total_original is 0
            reduction_pct = (
                (total_saved / total_original * 100) if total_original > 0 else 0.0
            )
            lines.append(
                f"Time Saved: {total_saved:.4f}s ({reduction_pct:.1f}% reduction)"
            )
            lines.append("")

        else:
            lines.append("-" * 100)
            lines.append("ORIGINAL PIPELINE - PHASE TIMING (Row-by-Row)")
            lines.append("-" * 100)

            # Show phase selection info if applicable
            if self.selected_phases:
                lines.append(f"Running selected phases: {sorted(self.selected_phases)}")
            lines.append("")

            lines.append(
                f"{'#':<4} {'Phase Name':<30} {'Duration':>12} {'% of Transform':>14} {'Output':>10}"
            )
            lines.append("-" * 74)

            phases_to_display = self._selected(self.phases)

            for phase in phases_to_display:
                pct = (
                    (phase.duration_seconds / self.transform_duration_seconds * 100)
                    if self.transform_duration_seconds > 0
                    else 0
                )
                lines.append(
                    f"{phase.phase_number:<4} {phase.name:<30} {phase.duration_seconds:>10.4f}s {pct:>12.1f}% {phase.output_count:>10,}"
                )

            lines.append("-" * 74)
            lines.append(
                f"{'':4} {'TOTAL TRANSFORM TIME':<30} {self.transform_duration_seconds:>10.4f}s {100.0:>12.1f}%"
            )
            lines.append("")

        # Top 5 slowest phases (Original)
        lines.append("-" * 100)
        lines.append("TOP 5 SLOWEST PHASES (Original Pipeline)")
        lines.append("-" * 100)

        phases_for_top5 = self._selected(self.phases)
        sorted_phases = sorted(
            phases_for_top5, key=lambda x: x.duration_seconds, reverse=True
        )[:5]
        for i, phase in enumerate(sorted_phases, 1):
            pct = (
                (phase.duration_seconds / self.transform_duration_seconds * 100)
                if self.transform_duration_seconds > 0
                else 0
            )
            lines.append(
                f"  {i}. {phase.name:<30} {phase.duration_seconds:>10.4f}s ({pct:.1f}%)"
            )
        lines.append("")

        # TOP SPEEDUP WINNERS (if Polars was run)
        if self.polars_phases:
            lines.append("-" * 100)
            lines.append("TOP 5 SPEEDUP WINNERS (Biggest Improvements with Polars)")
            lines.append("-" * 100)

            phases_for_speedup = self._selected(self.phases)
            polars_by_name = {p.name: p for p in self.polars_phases}
            speedups = []
            for phase in phases_for_speedup:
                polars_phase = polars_by_name.get(phase.name)
                # ignore phases too fast to measure meaningfully
                if polars_phase and phase.duration_seconds > 0.0001:
                    if polars_phase.duration_seconds > 0:
                        speedup = phase.duration_seconds / polars_phase.duration_seconds
                    else:
                        speedup = float("inf")
                    saved = phase.duration_seconds - polars_phase.duration_seconds
                    speedups.append(
                        (
                            phase.name,
                            phase.duration_seconds,
                            polars_phase.duration_seconds,
                            speedup,
                            saved,
                        )
                    )

            # rank by absolute time saved, not by ratio
            speedups.sort(key=lambda x: x[4], reverse=True)

            for i, (name, orig, polars, spd, saved) in enumerate(speedups[:5], 1):
                spd_str = f"{spd:.1f}x" if spd != float("inf") else "∞"
                lines.append(
                    f"  {i}. {name:<26} {orig:.4f}s → {polars:.4f}s ({spd_str} faster, {saved:.4f}s saved)"
                )
            lines.append("")

        # THROUGHPUT METRICS
        if (
            self.polars_phases
            and self.input_records > 0
            and self.transform_duration_seconds > 0
            and self.polars_transform_seconds > 0
        ):
            lines.append("-" * 100)
            lines.append("THROUGHPUT METRICS")
            lines.append("-" * 100)
            orig_throughput = self.input_records / self.transform_duration_seconds
            polars_throughput = self.input_records / self.polars_transform_seconds
            lines.append(f"Original Pipeline: {orig_throughput:,.0f} records/second")
            lines.append(f"Polars Pipeline: {polars_throughput:,.0f} records/second")
            lines.append(
                f"Throughput Gain: {polars_throughput - orig_throughput:,.0f} records/second faster"
            )
            lines.append("")

        lines.append("=" * 100)
        lines.append(
            f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )
        lines.append("=" * 100)

        return "\n".join(lines)

    def save_json(self, path: Path):
        """Save report as JSON file, creating parent directories as needed."""
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    def save_text(self, path: Path):
        """Save report as text file, creating parent directories as needed."""
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w") as f:
            f.write(self.generate_text_report())
"""
Pipeline execution engine for title-boundary dataset.

Handles running the full 26-phase digital-land transformation pipeline
with detailed timing and progress tracking.
"""

import time
from pathlib import Path
from datetime import datetime
from typing import Dict

from pipeline_config import PipelineConfig


class PipelineRunner:
    """Executes the digital-land transformation pipeline with timing."""

    def __init__(self, dataset: str = "title-boundary"):
        """
        Initialize pipeline runner.

        Args:
            dataset: Name of the dataset being processed
        """
        self.dataset = dataset
        # Lazily populated by get_pipeline_imports()
        self.pipeline_imports = None

    @staticmethod
    def _count_data_rows(path: Path) -> int:
        """Count data rows (excluding the header line) in a CSV file.

        Returns 0 when the file does not exist. Fix: the previous inline
        counting used open() without closing the file handle.
        """
        if not path.exists():
            return 0
        with open(path) as f:
            return max(sum(1 for _ in f) - 1, 0)

    def get_pipeline_imports(self):
        """
        Lazy import of digital-land pipeline modules.

        Returns dict of imported classes and functions; cached after the
        first call.
        """
        if self.pipeline_imports is not None:
            return self.pipeline_imports

        from digital_land.phase.convert import ConvertPhase
        from digital_land.phase.normalise import NormalisePhase
        from digital_land.phase.parse import ParsePhase
        from digital_land.phase.concat import ConcatFieldPhase
        from digital_land.phase.filter import FilterPhase
        from digital_land.phase.map import MapPhase
        from digital_land.phase.patch import PatchPhase
        from digital_land.phase.harmonise import HarmonisePhase
        from digital_land.phase.default import DefaultPhase
        from digital_land.phase.migrate import MigratePhase
        from digital_land.phase.organisation import OrganisationPhase
        from digital_land.phase.prune import (
            FieldPrunePhase,
            EntityPrunePhase,
            FactPrunePhase,
        )
        from digital_land.phase.reference import (
            EntityReferencePhase,
            FactReferencePhase,
        )
        from digital_land.phase.prefix import EntityPrefixPhase
        from digital_land.phase.lookup import EntityLookupPhase, FactLookupPhase
        from digital_land.phase.priority import PriorityPhase
        from digital_land.phase.pivot import PivotPhase
        from digital_land.phase.combine import FactCombinePhase
        from digital_land.phase.factor import FactorPhase
        from digital_land.phase.save import SavePhase
        from digital_land.pipeline.main import Pipeline
        from digital_land.specification import Specification
        from digital_land.organisation import Organisation
        from digital_land.log import (
            IssueLog,
            ColumnFieldLog,
            DatasetResourceLog,
            OperationalIssueLog,
            ConvertedResourceLog,
        )
        from digital_land.api import API

        self.pipeline_imports = {
            "ConvertPhase": ConvertPhase,
            "NormalisePhase": NormalisePhase,
            "ParsePhase": ParsePhase,
            "ConcatFieldPhase": ConcatFieldPhase,
            "FilterPhase": FilterPhase,
            "MapPhase": MapPhase,
            "PatchPhase": PatchPhase,
            "HarmonisePhase": HarmonisePhase,
            "DefaultPhase": DefaultPhase,
            "MigratePhase": MigratePhase,
            "OrganisationPhase": OrganisationPhase,
            "FieldPrunePhase": FieldPrunePhase,
            "EntityPrunePhase": EntityPrunePhase,
            "FactPrunePhase": FactPrunePhase,
            "EntityReferencePhase": EntityReferencePhase,
            "FactReferencePhase": FactReferencePhase,
            "EntityPrefixPhase": EntityPrefixPhase,
            "EntityLookupPhase": EntityLookupPhase,
            "FactLookupPhase": FactLookupPhase,
            "PriorityPhase": PriorityPhase,
            "PivotPhase": PivotPhase,
            "FactCombinePhase": FactCombinePhase,
            "FactorPhase": FactorPhase,
            "SavePhase": SavePhase,
            "Pipeline": Pipeline,
            "Specification": Specification,
            "Organisation": Organisation,
            "IssueLog": IssueLog,
            "ColumnFieldLog": ColumnFieldLog,
            "DatasetResourceLog": DatasetResourceLog,
            "OperationalIssueLog": OperationalIssueLog,
            "ConvertedResourceLog": ConvertedResourceLog,
            "API": API,
        }

        return self.pipeline_imports

    def run_full_pipeline(
        self,
        input_csv: Path,
        output_dir: Path,
        specification_dir: Path,
        pipeline_dir: Path,
        cache_dir: Path,
        la_name: str,
        report=None,
        selected_phases=None,
    ) -> Dict:
        """
        Run the full 26-phase digital-land transformation pipeline.

        Args:
            input_csv: Path to input CSV/Parquet file
            output_dir: Directory for output files
            specification_dir: Directory containing specification files
            pipeline_dir: Directory containing pipeline configuration
            cache_dir: Directory for cached resources
            la_name: Local Authority name/slug
            report: Optional PipelineReport instance for metrics tracking
            selected_phases: Optional set of phase numbers (1-26) to run

        Returns:
            Dict with results including file paths and record counts
        """
        print("   Loading digital-land pipeline modules...")
        p = self.get_pipeline_imports()

        # Convert Parquet to CSV if needed (original pipeline only supports CSV)
        if input_csv.suffix.lower() == ".parquet":
            import polars as pl

            csv_input = input_csv.with_suffix(".csv")
            if not csv_input.exists():
                print("   Converting Parquet to CSV for original pipeline...")
                pl.read_parquet(input_csv).write_csv(csv_input)
            input_csv = csv_input

        # Set up output paths
        harmonised_csv = output_dir / f"{la_name}_harmonised.csv"
        facts_csv = output_dir / f"{la_name}_facts.csv"
        issue_csv = output_dir / f"{la_name}_issues.csv"

        print(f"   Input: {input_csv}")
        print(f"   Harmonised: {harmonised_csv}")
        print(f"   Facts: {facts_csv}")

        # Load configuration
        specification = p["Specification"](str(specification_dir))
        pipeline = p["Pipeline"](
            str(pipeline_dir), self.dataset, specification=specification
        )
        schema = specification.pipeline.get(pipeline.name, {}).get(
            "schema", self.dataset
        )
        intermediate_fieldnames = specification.intermediate_fieldnames(pipeline)
        factor_fieldnames = specification.factor_fieldnames()

        # Create logs
        resource = la_name.lower().replace(" ", "_")
        issue_log = p["IssueLog"](dataset=self.dataset, resource=resource)
        operational_issue_log = p["OperationalIssueLog"](
            dataset=self.dataset, resource=resource
        )
        column_field_log = p["ColumnFieldLog"](dataset=self.dataset, resource=resource)
        dataset_resource_log = p["DatasetResourceLog"](
            dataset=self.dataset, resource=resource
        )
        converted_resource_log = p["ConvertedResourceLog"](
            dataset=self.dataset, resource=resource
        )

        # Load organization data
        org_csv = PipelineConfig.download_organisation_csv(cache_dir)
        organisation = p["Organisation"](
            organisation_path=str(org_csv), pipeline_dir=Path(pipeline_dir)
        )
        api = p["API"](specification=specification)

        # Get configuration
        entity_range_min = specification.get_dataset_entity_min(self.dataset)
        entity_range_max = specification.get_dataset_entity_max(self.dataset)
        endpoints = []
        organisations_list = ["government-organisation:D2"]
        entry_date = datetime.now().strftime("%Y-%m-%d")

        # Get pipeline configuration
        skip_patterns = pipeline.skip_patterns(resource, endpoints)
        columns = pipeline.columns(resource, endpoints=endpoints)
        concats = pipeline.concatenations(resource, endpoints=endpoints)
        patches = pipeline.patches(resource=resource, endpoints=endpoints)
        lookups = pipeline.lookups(resource=resource)
        default_fields = pipeline.default_fields(resource=resource, endpoints=endpoints)
        default_values = pipeline.default_values(endpoints=endpoints)
        combine_fields = pipeline.combine_fields(endpoints=endpoints)
        redirect_lookups = pipeline.redirect_lookups()
        migrations = pipeline.migrations()
        config = None
        valid_category_values = api.get_valid_category_values(self.dataset, pipeline)

        if len(organisations_list) == 1:
            default_values["organisation"] = organisations_list[0]
        if entry_date and "entry-date" not in default_values:
            default_values["entry-date"] = entry_date

        field_datatype_map = specification.get_field_datatype_map()
        field_typology_map = specification.get_field_typology_map()
        field_prefix_map = specification.get_field_prefix_map()
        dataset_prefix = specification.dataset_prefix(self.dataset)

        print("   Running 26-phase pipeline with per-phase timing...")

        # Define phase creators (lazily constructed so per-phase setup cost
        # is included in that phase's timing)
        phase_creators = [
            (
                1,
                "ConvertPhase",
                lambda: p["ConvertPhase"](
                    path=str(input_csv),
                    dataset_resource_log=dataset_resource_log,
                    converted_resource_log=converted_resource_log,
                ),
            ),
            (
                2,
                "NormalisePhase",
                lambda: p["NormalisePhase"](skip_patterns=skip_patterns),
            ),
            (3, "ParsePhase", lambda: p["ParsePhase"]()),
            (
                4,
                "ConcatFieldPhase",
                lambda: p["ConcatFieldPhase"](concats=concats, log=column_field_log),
            ),
            (
                5,
                "FilterPhase-1",
                lambda: p["FilterPhase"](filters=pipeline.filters(resource)),
            ),
            (
                6,
                "MapPhase",
                lambda: p["MapPhase"](
                    fieldnames=intermediate_fieldnames,
                    columns=columns,
                    log=column_field_log,
                ),
            ),
            (
                7,
                "FilterPhase-2",
                lambda: p["FilterPhase"](
                    filters=pipeline.filters(resource, endpoints=endpoints)
                ),
            ),
            (
                8,
                "PatchPhase",
                lambda: p["PatchPhase"](issues=issue_log, patches=patches),
            ),
            (
                9,
                "HarmonisePhase",
                lambda: p["HarmonisePhase"](
                    field_datatype_map=field_datatype_map,
                    issues=issue_log,
                    dataset=self.dataset,
                    valid_category_values=valid_category_values,
                ),
            ),
            (
                10,
                "DefaultPhase",
                lambda: p["DefaultPhase"](
                    default_fields=default_fields,
                    default_values=default_values,
                    issues=issue_log,
                ),
            ),
            (
                11,
                "MigratePhase",
                lambda: p["MigratePhase"](
                    fields=specification.schema_field[schema], migrations=migrations
                ),
            ),
            (
                12,
                "OrganisationPhase",
                lambda: p["OrganisationPhase"](
                    organisation=organisation, issues=issue_log
                ),
            ),
            (
                13,
                "FieldPrunePhase",
                lambda: p["FieldPrunePhase"](
                    fields=specification.current_fieldnames(schema)
                ),
            ),
            (
                14,
                "EntityReferencePhase",
                lambda: p["EntityReferencePhase"](
                    dataset=self.dataset, prefix=dataset_prefix, issues=issue_log
                ),
            ),
            (
                15,
                "EntityPrefixPhase",
                lambda: p["EntityPrefixPhase"](dataset=self.dataset),
            ),
            (
                16,
                "EntityLookupPhase",
                lambda: p["EntityLookupPhase"](
                    lookups=lookups,
                    redirect_lookups=redirect_lookups,
                    issue_log=issue_log,
                    operational_issue_log=operational_issue_log,
                    entity_range=[entity_range_min, entity_range_max],
                ),
            ),
            (
                17,
                "SavePhase-harmonised",
                lambda: p["SavePhase"](
                    str(harmonised_csv),
                    fieldnames=intermediate_fieldnames,
                    enabled=True,
                ),
            ),
            (
                18,
                "EntityPrunePhase",
                lambda: p["EntityPrunePhase"](
                    dataset_resource_log=dataset_resource_log
                ),
            ),
            (
                19,
                "PriorityPhase",
                lambda: p["PriorityPhase"](config=config, providers=organisations_list),
            ),
            (20, "PivotPhase", lambda: p["PivotPhase"]()),
            (
                21,
                "FactCombinePhase",
                lambda: p["FactCombinePhase"](
                    issue_log=issue_log, fields=combine_fields
                ),
            ),
            (22, "FactorPhase", lambda: p["FactorPhase"]()),
            (
                23,
                "FactReferencePhase",
                lambda: p["FactReferencePhase"](
                    field_typology_map=field_typology_map,
                    field_prefix_map=field_prefix_map,
                ),
            ),
            (
                24,
                "FactLookupPhase",
                lambda: p["FactLookupPhase"](
                    lookups=lookups,
                    redirect_lookups=redirect_lookups,
                    issue_log=issue_log,
                    odp_collections=specification.get_odp_collections(),
                ),
            ),
            (25, "FactPrunePhase", lambda: p["FactPrunePhase"]()),
            (
                26,
                "SavePhase-facts",
                lambda: p["SavePhase"](str(facts_csv), fieldnames=factor_fieldnames),
            ),
        ]

        # Run phases with timing. Each phase's output is materialised so the
        # per-phase duration and row count can be recorded.
        stream_data = []
        total_start = time.time()

        for phase_num, phase_name, phase_creator in phase_creators:
            phase = phase_creator()
            phase_start = time.time()

            # stream_data is empty before phase 1, so a single call covers
            # both the source phase and every downstream phase (the previous
            # phase_num == 1 special case was redundant).
            output_stream = phase.process(iter(stream_data))

            stream_data = list(output_stream)
            duration = time.time() - phase_start
            output_count = len(stream_data)

            if report:
                metrics = report.add_phase(phase_name, phase_num)
                metrics.duration_seconds = duration
                metrics.output_count = output_count

            # only report phases slow enough to matter
            if duration > 0.1:
                print(
                    f"     Phase {phase_num:2d}: {phase_name:<25} {duration:8.4f}s ({output_count:,} rows)"
                )

        total_transform_time = time.time() - total_start
        print(f"   Total transform time: {total_transform_time:.3f}s")

        # Count results (context-managed, see _count_data_rows)
        harmonised_count = self._count_data_rows(harmonised_csv)
        facts_count = self._count_data_rows(facts_csv)
        issue_log.save(str(issue_csv))

        if report:
            report.harmonised_records = harmonised_count
            report.fact_records = facts_count

        return {
            "harmonised": harmonised_count,
            "facts": facts_count,
            "harmonised_path": str(harmonised_csv),
            "facts_path": str(facts_csv),
            "issues_path": str(issue_csv),
            "transform_time": total_transform_time,
        }
#!/usr/bin/env python3
"""
Script to run the pipeline for all Local Authorities.
"""

import sys
import subprocess
import time
import json
from pathlib import Path
from datetime import datetime
from cli import CLI


def main():
    """Run pipeline for all Local Authorities."""
    # Optional record limit forwarded to each per-LA run.
    limit = sys.argv[1] if len(sys.argv) > 1 else None

    # Fetch all endpoints
    print("Fetching endpoint list...")
    endpoints = CLI.fetch_endpoint_list()
    print(f"Found {len(endpoints)} Local Authorities")
    print("Running with Polars comparison enabled\n")

    la_times = []
    errors = []
    batch_start = time.time()

    for position, endpoint in enumerate(endpoints, 1):
        la = endpoint.get("local_authority", "Unknown")
        print(f"\n{'='*60}")
        print(f"[{position}/{len(endpoints)}] Processing: {la}")
        print(f"{'='*60}")

        # Build command with --compare flag for Polars
        cmd = [sys.executable, "main.py", "--la", la, "--compare"]
        if limit:
            cmd += ["--limit", limit]

        # Time this LA
        started = time.time()
        outcome = subprocess.run(cmd)
        elapsed = time.time() - started

        if outcome.returncode == 0:
            print(f"  ✅ Completed {la} ({elapsed:.1f}s)")
            la_times.append({"la": la, "duration": elapsed, "status": "success"})
        else:
            print(f"  ⚠️ Error processing {la}")
            errors.append(la)
            la_times.append({"la": la, "duration": elapsed, "status": "error"})

    # Derive batch metrics from the per-LA records
    batch_duration = time.time() - batch_start
    successful_times = [t["duration"] for t in la_times if t["status"] == "success"]
    success_count = len(successful_times)
    error_count = len(errors)
    avg_duration = (
        sum(t["duration"] for t in la_times) / len(la_times) if la_times else 0
    )

    # Summary
    print(f"\n{'='*60}")
    print("BATCH PROCESSING COMPLETE (with Polars Comparison)")
    print(f"{'='*60}")
    print(f"  Total LAs: {len(endpoints)}")
    print(f"  Success: {success_count}")
    print(f"  Errors: {error_count}")
    print(f"  Total Time: {batch_duration:.1f}s ({batch_duration/60:.1f}m)")
    print(f"  Avg Time/LA: {avg_duration:.1f}s")
    if successful_times:
        print(f"  Min Time: {min(successful_times):.1f}s")
        print(f"  Max Time: {max(successful_times):.1f}s")
    print("\n  Note: All LAs processed with both Original + Polars pipelines")

    if errors:
        print("\nFailed Local Authorities:")
        for la in errors:
            print(f"  - {la}")

    # Save batch report
    reports_dir = Path(__file__).parent / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    batch_report = {
        "batch_timestamp": timestamp,
        "total_las": len(endpoints),
        "success_count": success_count,
        "error_count": error_count,
        "batch_duration_seconds": batch_duration,
        "average_duration_seconds": avg_duration,
        "polars_comparison_enabled": True,
        "limit": limit,
        "la_results": la_times,
        "errors": errors,
    }

    batch_json = reports_dir / f"batch_{timestamp}_summary.json"
    with open(batch_json, "w") as f:
        json.dump(batch_report, f, indent=2)

    print(f"\nBatch report saved: {batch_json}")
    print(f"{'='*60}\n")

    return 1 if errors else 0


if __name__ == "__main__":
    sys.exit(main())
    # Continues the `with open(input_csv, "w") as f:` opened just above this block.
    f.write(TEST_CSV_CONTENT)

print(f"Test data written to: {input_csv}")
print(f"Output will go to: {output_csv}")

# ── Import polars phases ────────────────────────────────────────────────────
# NOTE(review): imports are deliberately placed after the fixture is written,
# so an import failure still leaves the test CSV on disk for inspection.
import polars as pl

from digital_land.phase_polars import (
    run_polars_pipeline,
    ConvertPhase,
    NormalisePhase,
    ConcatFieldPhase,
    FilterPhase,
    MapPhase,
    PatchPhase,
    HarmonisePhase,
    DefaultPhase,
    MigratePhase,
    OrganisationPhase,
    FieldPrunePhase,
    EntityPrunePhase,
    FactPrunePhase,
    EntityReferencePhase,
    EntityPrefixPhase,
    EntityLookupPhase,
    FactLookupPhase,
    SavePhase,
    PivotPhase,
    FactCombinePhase,
    FactorPhase,
    PriorityPhase,
    DumpPhase,
    LoadPhase,
)
from digital_land.log import DatasetResourceLog, ConvertedResourceLog, ColumnFieldLog, IssueLog

# Global pass/fail counters mutated by check(); reported in the summary below.
passed = 0
failed = 0


def check(name, condition, detail=""):
    """Record one assertion: print PASS/FAIL and bump the module counters.

    `detail` is only shown on failure, to explain what value was observed.
    """
    global passed, failed
    if condition:
        print(f" PASS: {name}")
        passed += 1
    else:
        print(f" FAIL: {name} {detail}")
        failed += 1


# ═══════════════════════════════════════════════════════════════════════════
# TEST 1: ConvertPhase — loads CSV into DataFrame
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 1: ConvertPhase ──")
dataset_resource_log = DatasetResourceLog()
converted_resource_log = ConvertedResourceLog()
convert_phase = ConvertPhase(
    path=input_csv,
    dataset_resource_log=dataset_resource_log,
    converted_resource_log=converted_resource_log,
)
df = convert_phase.process()
check("returns DataFrame", isinstance(df, pl.DataFrame))
check("has 3 rows", df.height == 3, f"got {df.height}")
# Internal bookkeeping columns are expected to use a double-underscore prefix.
check("has __resource column", "__resource" in df.columns)
check("has __line_number column", "__line_number" in df.columns)
check("has reference column", "reference" in df.columns)
print(f" Columns: {[c for c in df.columns if not c.startswith('__')]}")
print(f" Shape: {df.shape}")

# ═══════════════════════════════════════════════════════════════════════════
# TEST 2: NormalisePhase — strips whitespace
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 2: NormalisePhase ──")
normalise_phase = NormalisePhase(skip_patterns=[])
df2 = normalise_phase.process(df)
check("preserves row count", df2.height == 3)
# Check that whitespace was stripped from " Test Area Three "
names = df2["name"].to_list()
check("whitespace stripped", "Test Area Three" in names, f"got {names}")

# ═══════════════════════════════════════════════════════════════════════════
# TEST 3: MapPhase — renames columns
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 3: MapPhase ──")
fieldnames = [
    "reference", "name", "geometry", "documentation-url",
    "start-date", "organisation", "entry-date", "point",
    "entity", "prefix", "end-date",
]
column_field_log = ColumnFieldLog()
map_phase = MapPhase(fieldnames=fieldnames, columns={}, log=column_field_log)
df3 = map_phase.process(df2)
check("preserves row count", df3.height == 3)
check("has reference column", "reference" in df3.columns)
data_cols = [c for c in df3.columns if not c.startswith("__")]
print(f" Mapped columns: {data_cols}")

# ═══════════════════════════════════════════════════════════════════════════
# TEST 4: PatchPhase — applies patches
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 4: PatchPhase ──")
# NOTE: issue_log is shared by several later phases, so issue counts accumulate.
issue_log = IssueLog(dataset="test-dataset", resource="test-resource")
patch_phase = PatchPhase(issues=issue_log, patches={})
df4 = patch_phase.process(df3)
check("no patches, same rows", df4.height == 3)

# Test with actual patches
patch_with_data = PatchPhase(
    issues=issue_log,
    patches={"name": {"Test Area One": "Patched Area One"}},
)
df4b = patch_with_data.process(df3)
names_patched = df4b["name"].to_list()
check("patch applied", "Patched Area One" in names_patched, f"got {names_patched}")

# ═══════════════════════════════════════════════════════════════════════════
# TEST 5: DefaultPhase — applies defaults
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 5: DefaultPhase ──")
default_phase = DefaultPhase(
    issues=issue_log,
    default_values={"end-date": ""},
)
# Add an empty end-date column
df5_in = df4.with_columns(pl.lit("").alias("end-date"))
df5 = default_phase.process(df5_in)
check("preserves rows", df5.height == 3)

# ═══════════════════════════════════════════════════════════════════════════
# TEST 6: FilterPhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 6: FilterPhase ──")
filter_phase = FilterPhase(filters={})
df6 = filter_phase.process(df5)
check("no filter, same rows", df6.height == 3)

# A concrete filter should keep only the single matching row.
filter_with_data = FilterPhase(filters={"reference": "ref-001"})
df6b = filter_with_data.process(df5)
check("filter applied", df6b.height == 1, f"got {df6b.height}")

# ═══════════════════════════════════════════════════════════════════════════
# TEST 7: MigratePhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 7: MigratePhase ──")
migrate_phase = MigratePhase(
    fields=["reference", "name", "geometry", "documentation-url",
            "start-date", "organisation", "entry-date", "end-date"],
    migrations={},
)
df7 = migrate_phase.process(df6)
check("preserves rows", df7.height == 3)
check("has reference", "reference" in df7.columns)

# ═══════════════════════════════════════════════════════════════════════════
# TEST 8: EntityReferencePhase + EntityPrefixPhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 8: EntityReferencePhase + EntityPrefixPhase ──")
ref_phase = EntityReferencePhase(dataset="test-dataset",
                                 prefix="test-dataset", issues=issue_log)
df8 = ref_phase.process(df7)
check("has prefix", "prefix" in df8.columns)
check("has reference", "reference" in df8.columns)
prefixes = df8["prefix"].to_list()
check("prefix set", all(p == "test-dataset" for p in prefixes), f"got {prefixes}")

# EntityPrefixPhase should be a no-op here since the prefix is already set.
prefix_phase = EntityPrefixPhase(dataset="test-dataset")
df8b = prefix_phase.process(df8)
check("prefix still set", all(p == "test-dataset" for p in df8b["prefix"].to_list()))

# ═══════════════════════════════════════════════════════════════════════════
# TEST 9: FieldPrunePhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 9: FieldPrunePhase ──")
prune_phase = FieldPrunePhase(fields=["reference", "name", "geometry", "organisation"])
df9 = prune_phase.process(df8b)
data_cols9 = [c for c in df9.columns if not c.startswith("__")]
# Allows up to 8 data columns — presumably prune keeps pipeline-required
# fields (entity/prefix/dates) beyond the 4 listed; TODO confirm.
check("pruned to expected fields", len(data_cols9) <= 8, f"got {data_cols9}")
check("has reference", "reference" in df9.columns)

# ═══════════════════════════════════════════════════════════════════════════
# TEST 10: EntityLookupPhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 10: EntityLookupPhase ──")
from digital_land.phase_polars.lookup import key as lookup_key
lookups = {
    lookup_key(prefix="test-dataset", reference="ref-001"): "1000001",
    lookup_key(prefix="test-dataset", reference="ref-002"): "1000002",
    lookup_key(prefix="test-dataset", reference="ref-003"): "1000003",
}
lookup_phase = EntityLookupPhase(
    lookups=lookups,
    redirect_lookups={},
    issue_log=issue_log,
    entity_range=[1000000, 2000000],
)
df10 = lookup_phase.process(df9)
check("has entity column", "entity" in df10.columns)
# Entities are compared as strings — assumes the phase keeps them as str.
entities = df10["entity"].to_list()
check("entities assigned", "1000001" in entities, f"got {entities}")
check("all entities assigned", all(e for e in entities), f"got {entities}")

# ═══════════════════════════════════════════════════════════════════════════
# TEST 11: EntityPrunePhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 11: EntityPrunePhase ──")
dataset_resource_log2 = DatasetResourceLog(dataset="test-dataset", resource="test-resource")
entity_prune = EntityPrunePhase(dataset_resource_log=dataset_resource_log2)
df11 = entity_prune.process(df10)
check("all rows kept (all have entities)", df11.height == 3)
check("entry count logged", dataset_resource_log2.entry_count == 3)

# ═══════════════════════════════════════════════════════════════════════════
# TEST 12: PriorityPhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 12: PriorityPhase ──")
priority_phase = PriorityPhase(config=None, providers=[])
df12 = priority_phase.process(df11)
check("has __priority", "__priority" in df12.columns)

# ═══════════════════════════════════════════════════════════════════════════
# TEST 13: PivotPhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 13: PivotPhase ──")
pivot_phase = PivotPhase()
df13 = pivot_phase.process(df12)
# Pivot turns each entity row into one row per (field, value) fact.
check("pivoted to facts", df13.height > 3, f"got {df13.height} rows (should be > 3)")
check("has fact column", "fact" in df13.columns)
check("has field column", "field" in df13.columns)
check("has value column", "value" in df13.columns)
print(f" Pivoted to {df13.height} fact rows from 3 entity rows")

# ═══════════════════════════════════════════════════════════════════════════
# TEST 14: FactorPhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 14: FactorPhase ──")
factor_phase = FactorPhase()
df14 = factor_phase.process(df13)
facts = df14["fact"].to_list()
non_empty_facts = [f for f in facts if f]
check("fact hashes generated", len(non_empty_facts) > 0, f"got {len(non_empty_facts)}")
# 64 hex chars == sha256 digest length.
check("fact is sha256 hex", len(non_empty_facts[0]) == 64 if non_empty_facts else False)

# ═══════════════════════════════════════════════════════════════════════════
# TEST 15: FactPrunePhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 15: FactPrunePhase ──")
fact_prune = FactPrunePhase()
df15 = fact_prune.process(df14)
check("facts pruned (empty values removed)", df15.height <= df14.height)
print(f" Before: {df14.height} → After: {df15.height}")

# ═══════════════════════════════════════════════════════════════════════════
# TEST 16: SavePhase
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 16: SavePhase ──")
save_phase = SavePhase(path=output_csv, fieldnames=["entity", "fact", "field", "value"])
df16 = save_phase.process(df15)
check("CSV file created", os.path.exists(output_csv))
if os.path.exists(output_csv):
    # Re-read via the stdlib csv module to verify the written file independently.
    import csv
    with open(output_csv) as f:
        reader = csv.DictReader(f)
        rows = list(reader)
    check("CSV has rows", len(rows) > 0, f"got {len(rows)}")
    check("CSV has entity column", "entity" in rows[0])
    print(f" Saved {len(rows)} rows to {output_csv}")

# ═══════════════════════════════════════════════════════════════════════════
# TEST 17: run_polars_pipeline (chained execution)
# ═══════════════════════════════════════════════════════════════════════════
print("\n── Test 17: run_polars_pipeline (chained) ──")
chain_output = os.path.join(tmp_dir, "chain_output.csv")
result_df = run_polars_pipeline(
    ConvertPhase(path=input_csv),
    NormalisePhase(),
    MapPhase(fieldnames=fieldnames, columns={}),
    FilterPhase(filters={}),
    SavePhase(path=chain_output, enabled=True),
)
check("chain returns DataFrame", isinstance(result_df, pl.DataFrame))
check("chain output file exists", os.path.exists(chain_output))
if os.path.exists(chain_output):
    result_check = pl.read_csv(chain_output)
    check("chain output has 3 rows", result_check.height == 3, f"got {result_check.height}")

# ═══════════════════════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print(f"RESULTS: {passed} passed, {failed} failed out of {passed + failed} checks")
print("=" * 70)

# Exit status mirrors the result so CI can gate on this script directly.
# NOTE(review): tmp_dir is intentionally not removed, so outputs can be
# inspected after a run; confirm this is desired before adding cleanup.
if failed > 0:
    print("\nSome tests FAILED!")
    sys.exit(1)
else:
    print("\nAll tests PASSED!")
    sys.exit(0)