Imageomics · Dahlializi · Nov 17, 2025 · Nov 22, 2025 · Nov 24, 2025 · Jan 22, 2026
diff --git a/.github/workflows/deploy-docs.yaml b/.github/workflows/deploy-docs.yaml
@@ -0,0 +1,103 @@
+# Builds the MkDocs site and publishes it to the gh-pages branch.
+# Pushes to main publish at the branch root; pull requests get a pr-<N>/ preview.
+name: Build & Deploy MkDocs (gh-pages with PR previews)
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - main
+    # `closed` is included so the cleanup job can remove the preview folder.
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - closed
+  push:
+    branches:
+      - main
+
+# Workflow-wide token scopes; contents: write is what allows pushing to gh-pages.
+# NOTE(review): no job visible in this workflow calls the Pages deployment API,
+# so `pages: write` may be unnecessary - confirm before removing.
+permissions:
+  contents: write
+  pages: write
+
+jobs:
+  build:
+    # Build the site for pushes, manual dispatches, and pull requests that come
+    # from this repository (not forks) and are not being closed.
+    if: |
+      github.event_name == 'push' ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' &&
+       github.event.pull_request.head.repo.fork == false &&
+       github.event.action != 'closed')
+    runs-on: ubuntu-latest
+    concurrency:
+      # Job-level concurrency groups are repository-wide. This name must stay
+      # distinct from the deploy job's group: with a shared name and
+      # cancel-in-progress, a deploy job queued by an older run would cancel
+      # the build of a newer run on the same ref.
+      group: ${{ github.workflow }}-build-${{ github.ref }}
+      cancel-in-progress: true
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # Full history rather than a shallow clone
+          # (presumably for docs plugins that read git metadata - confirm).
+          fetch-depth: 0
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install deps
+        run: |
+          python -m pip install --upgrade pip
+          pip install '.[docs]'
+      - name: Build with MkDocs
+        run: mkdocs build
+      # Hand the rendered site to the deploy job through an artifact named `site`.
+      - name: Upload built site as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: site
+          path: ./site
+
+  deploy:
+    needs: build
+    # Deploy on push to main (root) or PRs from SAME repo (not closed) -> pr-<N>/
+    # Manual dispatches build the site but are not published.
+    if: |
+      github.event_name == 'push' ||
+      (github.event_name == 'pull_request' &&
+       github.event.pull_request.head.repo.fork == false &&
+       github.event.action != 'closed')
+    runs-on: ubuntu-latest
+    concurrency:
+      # Job-level concurrency groups are repository-wide. This name must stay
+      # distinct from the build job's group: with a shared name and
+      # cancel-in-progress, this job could cancel the build of a newer run on
+      # the same ref (and a newly queued build could cancel this deploy).
+      group: ${{ github.workflow }}-deploy-${{ github.ref }}
+      cancel-in-progress: true
+    steps:
+      # Fetch the `site` artifact produced by the build job.
+      - name: Download built site
+        uses: actions/download-artifact@v4
+        with:
+          name: site
+          path: ./site
+      - name: Deploy to gh-pages
+        uses: peaceiris/actions-gh-pages@v4
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_branch: gh-pages
+          publish_dir: ./site
+          # Keep what is already on the branch so publishing one target
+          # (root or a pr-<N>/ folder) does not wipe the others.
+          keep_files: true
+          # pull_request events publish into pr-<number>/; every other event
+          # yields an empty string, i.e. the branch root.
+          destination_dir: ${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.number) || '' }}
+
+  cleanup:
+    # Runs only for the `closed` action of a pull request from this repository;
+    # deletes that PR's preview folder from the gh-pages branch.
+    if: >
+      github.event_name == 'pull_request' &&
+      github.event.pull_request.head.repo.fork == false &&
+      github.event.action == 'closed'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout gh-pages
+        uses: actions/checkout@v4
+        with:
+          ref: gh-pages
+          fetch-depth: 0
+      # Identity used for the removal commit created below.
+      - name: Configure git author
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+      - name: Remove preview folder
+        shell: bash
+        run: |
+          set -euo pipefail
+          preview_dir="pr-${{ github.event.number }}"
+          echo "Attempting to remove $preview_dir"
+          # Nothing was ever published for this PR: succeed without committing.
+          if [ ! -d "$preview_dir" ]; then
+            echo "No preview folder $preview_dir found; nothing to do."
+            exit 0
+          fi
+          git rm -r "$preview_dir"
+          git commit -m "Remove preview for PR #${{ github.event.number }}"
+          git push origin gh-pages
diff --git a/docs/_assets/taxonopy_banner.svg b/docs/_assets/taxonopy_banner.svg
diff --git a/docs/_assets/taxonopy_logo.svg b/docs/_assets/taxonopy_logo.svg
diff --git a/docs/_scripts/gen_cli_help_docs.py b/docs/_scripts/gen_cli_help_docs.py
@@ -0,0 +1,61 @@
+"""Generate CLI help pages for MkDocs without shell execution."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+import mkdocs_gen_files
+
+# Repository root: this script lives in docs/_scripts/, so the root is two
+# levels above the script's own directory.
+ROOT = Path(__file__).resolve().parents[2]
+SRC = ROOT / "src"
+# Put src/ at the front of sys.path (once) so the taxonopy import below can be
+# resolved from the working tree.
+# NOTE(review): presumably so the docs build works without installing the
+# package - confirm.
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+# Imported after the sys.path adjustment above, hence the E402 suppression.
+from taxonopy.cli import create_parser  # noqa: E402
+
+
+def get_subparser(parser: argparse.ArgumentParser, name: str) -> argparse.ArgumentParser:
+    for action in parser._actions:
+        if isinstance(action, argparse._SubParsersAction):
+            if name in action.choices:
+                return action.choices[name]
+    raise KeyError(f"Subparser '{name}' not found")
+
+
+def render_section(title: str, help_text: str) -> str:
+    return f"## `{title}`\n\n```console\n{help_text.rstrip()}\n```\n"
+
+
+def main() -> None:
+    parser = create_parser()
+    parser.prog = "taxonopy"
+    resolve_parser = get_subparser(parser, "resolve")
+    trace_parser = get_subparser(parser, "trace")
+    trace_entry_parser = get_subparser(trace_parser, "entry")
+    common_parser = get_subparser(parser, "common-names")
+
+    resolve_parser.prog = "taxonopy resolve"
+    trace_parser.prog = "taxonopy trace"
+    trace_entry_parser.prog = "taxonopy trace entry"
+    common_parser.prog = "taxonopy common-names"
+
+    sections = [
+        ("taxonopy --help", parser.format_help()),
+        ("taxonopy resolve --help", resolve_parser.format_help()),
+        ("taxonopy trace --help", trace_parser.format_help()),
+        ("taxonopy trace entry --help", trace_entry_parser.format_help()),
+        ("taxonopy common-names --help", common_parser.format_help()),
+    ]
+
+    with mkdocs_gen_files.open("command_line_usage/help.md", "w") as file_handle:
+        file_handle.write("# Help\n\n")
+        file_handle.write("Command reference for the TaxonoPy CLI.\n\n")
+        for title, help_text in sections:
+            file_handle.write(render_section(title, help_text))
+            file_handle.write("\n")
+
+
+if __name__ in {"__main__", "<run_path>"}:
+    main()
diff --git a/docs/acknowledgements.md b/docs/acknowledgements.md
@@ -0,0 +1,3 @@
+# Acknowledgments
+
+The [Imageomics Institute](https://imageomics.org/) is supported by the National Science Foundation under [Award No. 2118240](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2118240) "HDR Institute: Imageomics: A New Frontier of Biological Information Powered by Knowledge-Guided Machine Learning." Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation.
diff --git a/docs/command_line_usage/help.md b/docs/command_line_usage/help.md
@@ -0,0 +1,29 @@
+# Help
+You may view the help for the command line interface by running:
+
+```bash
+taxonopy --help
+```
+This will show you the available commands and options:
+
+```
+usage: taxonopy [-h] [--cache-dir CACHE_DIR] [--show-cache-path] [--cache-stats] [--clear-cache] [--show-config] [--version] {resolve,trace,common-names} ...
+
+TaxonoPy: Resolve taxonomic names using GNVerifier and trace data provenance.
+
+positional arguments:
+  {resolve,trace,common-names}
+    resolve             Run the taxonomic resolution workflow
+    trace               Trace data provenance of TaxonoPy objects
+    common-names        Merge vernacular names (post-process) into resolved outputs
+
+options:
+  -h, --help            show this help message and exit
+  --cache-dir CACHE_DIR
+                        Directory for TaxonoPy cache (can also be set with TAXONOPY_CACHE_DIR environment variable) (default: None)
+  --show-cache-path     Display the current cache directory path and exit (default: False)
+  --cache-stats         Display statistics about the cache and exit (default: False)
+  --clear-cache         Clear the TaxonoPy object cache. May be used in isolation. (default: False)
+  --show-config         Show current configuration and exit (default: False)
+  --version             Show version number and exit
+```
diff --git a/docs/command_line_usage/tutorial.md b/docs/command_line_usage/tutorial.md
@@ -0,0 +1,90 @@
+# Command Line Tutorial
+
+**Command ```resolve```:**
+The ```resolve``` command is used to perform taxonomic resolution on a dataset. It takes a Parquet or CSV file (or a directory of such files) as input and writes the resolved output files, in Parquet or CSV format, to an output directory.
+```
+usage: taxonopy resolve [-h] -i INPUT -o OUTPUT_DIR [--output-format {csv,parquet}] [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--log-file LOG_FILE] [--force-input] [--batch-size BATCH_SIZE] [--all-matches]
+                        [--capitalize] [--fuzzy-uninomial] [--fuzzy-relaxed] [--species-group] [--refresh-cache]
+
+options:
+  -h, --help            show this help message and exit
+  -i, --input INPUT     Path to input Parquet or CSV file/directory
+  -o, --output-dir OUTPUT_DIR
+                        Directory to save resolved and unsolved output files
+  --output-format {csv,parquet}
+                        Output file format
+  --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
+                        Set logging level
+  --log-file LOG_FILE   Optional file to write logs to
+  --force-input         Force use of input metadata without resolution
+
+GNVerifier Settings:
+  --batch-size BATCH_SIZE
+                        Max number of name queries per GNVerifier API/subprocess call
+  --all-matches         Return all matches instead of just the best one
+  --capitalize          Capitalize the first letter of each name
+  --fuzzy-uninomial     Enable fuzzy matching for uninomial names
+  --fuzzy-relaxed       Relax fuzzy matching criteria
+  --species-group       Enable group species matching
+
+Cache Management:
+  --refresh-cache       Force refresh of cached objects (input parsing, grouping) before running.
+```
+It is recommended to keep GNVerifier settings at their defaults.
+
+**Command ```trace```:**
+The ```trace``` command is used to trace the provenance of a taxonomic entry. It takes a UUID and an input path as arguments and outputs the full path of the entry through TaxonoPy.
+```
+usage: taxonopy trace [-h] {entry} ...
+
+positional arguments:
+  {entry}
+    entry     Trace an individual taxonomic entry by UUID
+
+options:
+  -h, --help  show this help message and exit
+
+usage: taxonopy trace entry [-h] --uuid UUID --from-input FROM_INPUT [--format {json,text}] [--verbose]
+
+options:
+  -h, --help            show this help message and exit
+  --uuid UUID           UUID of the taxonomic entry
+  --from-input FROM_INPUT
+                        Path to the original input dataset
+  --format {json,text}  Output format
+  --verbose             Show full details including all UUIDs in group
+```
+
+**Command ```common-names```:**
+The ```common-names``` command is used to merge vernacular names into the resolved output. It takes a directory of resolved Parquet partitions as input and outputs a directory of resolved Parquet partitions with common names.
+
+```
+usage: taxonopy common-names [-h] --resolved-dir ANNOTATION_DIR --output-dir OUTPUT_DIR
+
+options:
+  -h, --help            show this help message and exit
+  --resolved-dir ANNOTATION_DIR
+                        Directory containing your *.resolved.parquet files
+  --output-dir OUTPUT_DIR
+                        Directory to write annotated .parquet files
+```
+
+Note that the ```common-names``` command is a post-processing step and should be run after the ```resolve``` command.
+
+## Example Usage
+To perform taxonomic resolution on a dataset with subsequent common name annotation, run:
+```
+taxonopy resolve \
+    --input /path/to/formatted/input \
+    --output-dir /path/to/resolved/output
+```
+```
+taxonopy common-names \
+    --resolved-dir /path/to/resolved/output \
+    --output-dir /path/to/resolved_with_common-names/output
+```
+TaxonoPy creates a cache of the objects associated with input entries for use with the ```trace``` command. By default, this cache is stored in the ```~/.cache/taxonopy``` directory.
+
+## Development
+
+See the [Wiki Development Page](https://github.com/Imageomics/TaxonoPy/wiki/Development) for development instructions.
diff --git a/docs/development/contributing/index.md b/docs/development/contributing/index.md
@@ -0,0 +1,5 @@
+# Contributing
+
+We welcome contributions to TaxonoPy. More detailed guidance will be added here.
+
+If you have suggestions or run into a bug, please open an issue at [https://github.com/Imageomics/TaxonoPy/issues](https://github.com/Imageomics/TaxonoPy/issues).
diff --git a/docs/index.md b/docs/index.md
@@ -0,0 +1,68 @@
+---
+title: Home
+hide:
+  - title
+---
+
+# TaxonoPy {: .taxonopy-home-title }
+
+![TaxonoPy banner](_assets/taxonopy_banner.svg)
+
+<h2 style="text-align:center; margin-top:0;">Reproducibly Aligned Biological Taxonomies</h2>
+<div align="center">
+  <a href="https://doi.org/10.5281/zenodo.15499454">
+    <img src="https://zenodo.org/badge/789041700.svg" alt="DOI">
+  </a>
+  <a href="https://pypi.org/project/taxonopy">
+    <img src="https://img.shields.io/pypi/v/taxonopy.svg" alt="PyPI - Version">
+  </a>
+  <a href="https://pypi.org/project/taxonopy">
+    <img src="https://img.shields.io/pypi/pyversions/taxonopy.svg" alt="PyPI - Python Version">
+  </a>
+</div>
+
+TaxonoPy (taxon-o-py) is a command-line tool for creating reproducibly aligned biological taxonomies using the [Global Names Verifier (gnverifier)](https://github.com/gnames/gnverifier).
+
+## Package Purpose
+TaxonoPy aligns data to a single, internally consistent 7-rank Linnaean taxonomic hierarchy across large biodiversity datasets assembled from multiple providers, each of which may use overlapping but nonuniform taxonomies. The goal is AI-ready biodiversity data with clean, aligned taxonomy.
+
+Its development has been driven by its application in the [TreeOfLife-200M dataset](https://huggingface.co/datasets/imageomics/TreeOfLife-200M). This dataset contains over 200 million labeled images of organisms from four core data providers:
+
+- [The Global Biodiversity Information Facility (GBIF)](https://www.gbif.org/)
+- [BIOSCAN-5M](https://biodiversitygenomics.net/projects/5m-insects/)
+- [FathomNet](https://www.fathomnet.org/)
+- [The Encyclopedia of Life (EOL)](https://eol.org/)
+
+Across these resources, taxon names and classifications often conflict. TaxonoPy resolves those differences into a coherent, standardized taxonomy for the combined dataset.
+
+## Challenges
+The taxonomy information is provided by each data provider and original sources, but the classification can be:
+
+- **Inconsistent** — between and within sources (e.g., kingdom *Metazoa* vs. *Animalia*)
+- **Incomplete** — missing ranks or containing "holes"
+- **Incorrect** — spelling errors, nonstandard terms, or outdated classifications
+- **Ambiguous** — homonyms, synonyms, and terms with multiple interpretations
+
+Taxonomic authorities exist to standardize classification, but:
+
+- There are multiple authorities  
+- They may disagree  
+- A given organism may be missing from some  
+
+## Solution
+TaxonoPy uses the taxonomic lineages provided by diverse sources to submit batched queries to GNVerifier and resolve to a standardized classification path for each sample in the dataset. It is currently configured to prioritize alignment to the [GBIF Backbone Taxonomy](https://verifier.globalnames.org/data_sources/11). Where GBIF misses, backup sources of the [Catalogue of Life](https://verifier.globalnames.org/data_sources/1) and [Open Tree of Life (OTOL) Reference Taxonomy](https://verifier.globalnames.org/data_sources/179) are used.
+
+## Getting Started
+To get started with TaxonoPy, see the [Quick Reference](user-guide/quick-reference.md) guide.
+
+---
+
+!!! warning
+    Taxonomic classifications are human-constructed models of biological diversity, not direct representations of biological reality.
+    Names and ranks reflect taxonomic concepts that may vary between authorities, evolve over time, and differ in scope or interpretation.
+
+    TaxonoPy aims to produce a **consistent, transparent, and fit-for-purpose classification** suitable for large-scale data integration and AI workflows.
+    It prioritizes internal coherence and interoperability across datasets and providers by aligning source data to a selected reference taxonomy.
+
+    It is a progressive effort to improve taxonomic alignment in an evolving landscape.
+    If you have suggestions or encounter bugs, please see the [Contributing](development/contributing/index.md) page.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Acknowledgments

		The [Imageomics Institute](https://imageomics.org/) is supported by the National Science Foundation under [Award No. 2118240](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2118240) "HDR Institute: Imageomics: A New Frontier of Biological Information Powered by Knowledge-Guided Machine Learning." Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation.