diff --git a/docs/tutorials/DTO_analysis.ipynb b/docs/tutorials/DTO_analysis.ipynb new file mode 100644 index 0000000..4c11286 --- /dev/null +++ b/docs/tutorials/DTO_analysis.ipynb @@ -0,0 +1,968 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DTO Analysis: Significance Filtering of Cross-Dataset Binding Samples\n", + "\n", + "This notebook analyzes the correlation between transcription factor (TF) binding data and perturbation data.\n", + "\n", + "## Analysis Objectives\n", + "\n", + "1. Select all binding samples with DTO p <= 0.01 compared to **Hackett-2020-ZEV**.\n", + "2. Select all binding samples with DTO p <= 0.01 compared to **Kemmeren-2014-TFKO**.\n", + "3. Find the intersection of the two sets above.\n", + "4. For each regulator in the active set, count the number of active samples. (An illustrative sketch of objectives 3 and 4 appears at the end of this notebook.)\n", + "\n", + "## Challenges and Additional Analysis\n", + "\n", + "- Explore the time-point effects in the Hackett data.\n", + "- Analyze the impact of different time points on the DTO distribution.\n", + "- Select the optimal conditions (e.g., ZEV vs GEV) for each regulator." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from collections import Counter\n", + "\n", + "# Set display options\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_rows', 100)\n", + "pd.set_option('display.width', None)\n", + "\n", + "# Set plot style\n", + "plt.style.use('seaborn-v0_8-whitegrid')\n", + "sns.set_palette('husl')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u2705 Configuration file saved at: /tmp/tmpv37ibelm/vdb_config.yaml\n" + ] + } + ], + "source": [ + "# Create VirtualDB configuration\n", + "# This configuration defines how to map the fields of different datasets and how to associate DTO comparative analysis data\n", + "\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "config_yaml = \"\"\"\n", + "repositories:\n", + "  BrentLab/harbison_2004:\n", + "    dataset:\n", + "      harbison_2004:\n", + "        sample_id:\n", + "          field: sample_id\n", + "        carbon_source:\n", + "          field: condition\n", + "          path: media.carbon_source.compound\n", + "        temperature_celsius:\n", + "          field: condition\n", + "          path: temperature_celsius\n", + "          dtype: numeric\n", + "        environmental_condition:\n", + "          field: condition\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        regulator_symbol:\n", + "          field: regulator_symbol\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            dataset: dto\n", + "            via_field: binding_id\n", + "\n", + "  BrentLab/rossi_2021:\n", + "    carbon_source: \n", + "      path: media.carbon_source.compound\n", + "    temperature_celsius: \n", + "      path: temperature_celsius\n", + "    dataset:\n", + "      rossi_2021_af_combined:\n", + "        sample_id: \n", + "          field: sample_id\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        target_locus_tag:\n", + "          field: target_locus_tag\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            dataset: dto\n", + "            via_field: binding_id\n", + "\n", + "  BrentLab/mahendrawada_2025:\n", + "    dataset:\n", + "      reprocessed_diffcontrol_5prime:\n", + "        sample_id:\n", + 
" field: sample_id\n", + " control_source:\n", + " field: control_source\n", + " regulator_locus_tag:\n", + " field: regulator_locus_tag\n", + " regulator_symbol:\n", + " field: regulator_symbol\n", + " environmental_condition:\n", + " field: condition\n", + " temperature_celsius:\n", + " field: condition\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + " media_name:\n", + " field: condition\n", + " path: media.name\n", + " carbon_source:\n", + " field: condition\n", + " path: media.carbon_source\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: binding_id\n", + "\n", + "\n", + " BrentLab/callingcards:\n", + " carbon_source: \n", + " path: media.carbon_source.compound\n", + " temperature_celsius: \n", + " path: temperature_celsius\n", + " dataset:\n", + " annotated_features:\n", + " id:\n", + " field: id\n", + " regulator_locus_tag:\n", + " field: target_locus_tag\n", + " regulator_symbol:\n", + " field: target_symbol\n", + " \n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: binding_id\n", + " \n", + " BrentLab/hackett_2020:\n", + " dataset:\n", + " hackett_2020:\n", + " sample_id:\n", + " field: sample_id\n", + " dtype: numeric\n", + " regulator_locus_tag:\n", + " field: regulator_locus_tag\n", + " temperature_celsius:\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + " cultivation_method:\n", + " path: cultivation_method\n", + " media_name:\n", + " path: media.name\n", + " induction_system:\n", + " field: mechanism\n", + " inducer_compound:\n", + " field: mechanism\n", + " path: definitions.inducer\n", + " nutrient_restriction:\n", + " field: restriction\n", + " log2fc:\n", + " field: log2_shrunken_timecourses\n", + " dtype: numeric\n", + " log2_raw_ratio:\n", + " field: log2_ratio\n", + " dtype: numeric\n", + " time_point:\n", + " field: time\n", + " dtype: numeric\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: perturbation_id\n", + "\n", + " BrentLab/kemmeren_2014:\n", + " dataset:\n", + " kemmeren_2014:\n", + " sample_id:\n", + " field: sample_id\n", + " carbon_source:\n", + " path: media.carbon_source.compound\n", + " temperature_celsius:\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: perturbation_id\n", + "\n", + " BrentLab/yeast_comparative_analysis:\n", + " dataset:\n", + " dto:\n", + " binding_id:\n", + " field: binding_id\n", + " perturbation_id:\n", + " field: perturbation_id\n", + " fdr:\n", + " field: dto_fdr\n", + " dtype: numeric\n", + " pvalue:\n", + " field: dto_empirical_pvalue\n", + " dtype: numeric\n", + " binding_threshold:\n", + " field: binding_rank_threshold\n", + " dtype: numeric\n", + " perturbation_threshold:\n", + " field: perturbation_rank_threshold\n", + " dtype: numeric\n", + " binding_set_size:\n", + " field: binding_set_size\n", + " dtype: numeric\n", + " perturbation_set_size:\n", + " field: perturbation_set_size\n", + " dtype: numeric\n", + "\n", + "factor_aliases:\n", + " carbon_source:\n", + " glucose: [D-glucose, dextrose, glu]\n", + " galactose: [D-galactose, gal]\n", + " raffinose: [D-raffinose]\n", + "\n", + "missing_value_labels:\n", + " carbon_source: \"unspecified\"\n", + "\n", + "description:\n", + " carbon_source: The carbon source provided during 
growth\n", + " temperature_celsius: Growth temperature in degrees Celsius\n", + " environmental_condition: Named environmental condition\n", + "\"\"\"\n", + "\n", + "# Save the configuration to a temporary file\n", + "temp_config = Path(tempfile.mkdtemp()) / \"vdb_config.yaml\"\n", + "temp_config.write_text(config_yaml)\n", + "\n", + "print(f\"\u2705 Configuration file saved at: {temp_config}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u2705 VirtualDB initialized successfully!\n", + "Number of configured repositories: 7\n", + "\n", + "Configured datasets:\n", + " - BrentLab/harbison_2004/harbison_2004\n", + " - BrentLab/rossi_2021/rossi_2021_af_combined\n", + " - BrentLab/mahendrawada_2025/reprocessed_diffcontrol_5prime\n", + " - BrentLab/callingcards/annotated_features\n", + " - BrentLab/hackett_2020/hackett_2020\n", + " - BrentLab/kemmeren_2014/kemmeren_2014\n", + " - BrentLab/yeast_comparative_analysis/dto\n" + ] + } + ], + "source": [ + "# Initialize VirtualDB\n", + "from tfbpapi.virtual_db import VirtualDB\n", + "\n", + "# Token authentication required\n", + "hf_token = \"\"\n", + "\n", + "vdb = VirtualDB(str(temp_config), token=hf_token)\n", + "\n", + "print(\"\u2705 VirtualDB initialized successfully!\")\n", + "print(f\"Number of configured repositories: {len(vdb.config.repositories)}\")\n", + "\n", + "# List all configured datasets\n", + "print(\"\\nConfigured datasets:\")\n", + "for repo_id, repo_config in vdb.config.repositories.items():\n", + " if repo_config.dataset:\n", + " for config_name in repo_config.dataset.keys():\n", + " print(f\" - {repo_id}/{config_name}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Added the ability to perform comparative analysis to the query." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 6 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 51569.31it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idcarbon_sourcetemperature_celsiuspvalueperturbation_iddataset_id
01glucose30NaNNaNBrentLab/harbison_2004/harbison_2004
12glucose300.301BrentLab/kemmeren_2014;kemmeren_2014;18BrentLab/harbison_2004/harbison_2004
22glucose30NaNBrentLab/Hackett_2020;hackett_2020;33BrentLab/harbison_2004/harbison_2004
32glucose300.512BrentLab/Hackett_2020;hackett_2020;34BrentLab/harbison_2004/harbison_2004
42glucose300.306BrentLab/Hackett_2020;hackett_2020;40BrentLab/harbison_2004/harbison_2004
52glucose30NaNBrentLab/Hackett_2020;hackett_2020;37BrentLab/harbison_2004/harbison_2004
62glucose300.309BrentLab/Hackett_2020;hackett_2020;38BrentLab/harbison_2004/harbison_2004
72glucose300.644BrentLab/Hackett_2020;hackett_2020;36BrentLab/harbison_2004/harbison_2004
82glucose300.411BrentLab/Hackett_2020;hackett_2020;35BrentLab/harbison_2004/harbison_2004
92glucose300.536BrentLab/Hackett_2020;hackett_2020;39BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id carbon_source temperature_celsius pvalue \\\n", + "0 1 glucose 30 NaN \n", + "1 2 glucose 30 0.301 \n", + "2 2 glucose 30 NaN \n", + "3 2 glucose 30 0.512 \n", + "4 2 glucose 30 0.306 \n", + "5 2 glucose 30 NaN \n", + "6 2 glucose 30 0.309 \n", + "7 2 glucose 30 0.644 \n", + "8 2 glucose 30 0.411 \n", + "9 2 glucose 30 0.536 \n", + "\n", + " perturbation_id \\\n", + "0 NaN \n", + "1 BrentLab/kemmeren_2014;kemmeren_2014;18 \n", + "2 BrentLab/Hackett_2020;hackett_2020;33 \n", + "3 BrentLab/Hackett_2020;hackett_2020;34 \n", + "4 BrentLab/Hackett_2020;hackett_2020;40 \n", + "5 BrentLab/Hackett_2020;hackett_2020;37 \n", + "6 BrentLab/Hackett_2020;hackett_2020;38 \n", + "7 BrentLab/Hackett_2020;hackett_2020;36 \n", + "8 BrentLab/Hackett_2020;hackett_2020;35 \n", + "9 BrentLab/Hackett_2020;hackett_2020;39 \n", + "\n", + " dataset_id \n", + "0 BrentLab/harbison_2004/harbison_2004 \n", + "1 BrentLab/harbison_2004/harbison_2004 \n", + "2 BrentLab/harbison_2004/harbison_2004 \n", + "3 BrentLab/harbison_2004/harbison_2004 \n", + "4 BrentLab/harbison_2004/harbison_2004 \n", + "5 BrentLab/harbison_2004/harbison_2004 \n", + "6 BrentLab/harbison_2004/harbison_2004 \n", + "7 BrentLab/harbison_2004/harbison_2004 \n", + "8 BrentLab/harbison_2004/harbison_2004 \n", + "9 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_p001 = vdb.query(\n", + " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", + " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"pvalue\", \"perturbation_id\"],\n", + ")\n", + "\n", + "all_p001.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 6 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 49152.00it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idtemperature_celsiusperturbation_idbinding_idpvaluedataset_id
03430BrentLab/Hackett_2020;hackett_2020;34BrentLab/callingcards;annotated_features;3940.010BrentLab/hackett_2020/hackett_2020
13830BrentLab/Hackett_2020;hackett_2020;38BrentLab/callingcards;annotated_features;3800.003BrentLab/hackett_2020/hackett_2020
23930BrentLab/Hackett_2020;hackett_2020;39BrentLab/callingcards;annotated_features;3800.000BrentLab/hackett_2020/hackett_2020
33930BrentLab/Hackett_2020;hackett_2020;39BrentLab/callingcards;annotated_features;7480.000BrentLab/hackett_2020/hackett_2020
44030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;3800.000BrentLab/hackett_2020/hackett_2020
54030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;3940.000BrentLab/hackett_2020/hackett_2020
64030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;7480.000BrentLab/hackett_2020/hackett_2020
74430BrentLab/Hackett_2020;hackett_2020;44BrentLab/callingcards;annotated_features;340.000BrentLab/hackett_2020/hackett_2020
84430BrentLab/Hackett_2020;hackett_2020;44BrentLab/harbison_2004;harbison_2004;70.009BrentLab/hackett_2020/hackett_2020
94430BrentLab/Hackett_2020;hackett_2020;44BrentLab/harbison_2004;harbison_2004;80.007BrentLab/hackett_2020/hackett_2020
\n", + "
" + ], + "text/plain": [ + " sample_id temperature_celsius perturbation_id \\\n", + "0 34 30 BrentLab/Hackett_2020;hackett_2020;34 \n", + "1 38 30 BrentLab/Hackett_2020;hackett_2020;38 \n", + "2 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n", + "3 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n", + "4 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "5 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "6 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "7 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "8 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "9 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "\n", + " binding_id pvalue \\\n", + "0 BrentLab/callingcards;annotated_features;394 0.010 \n", + "1 BrentLab/callingcards;annotated_features;380 0.003 \n", + "2 BrentLab/callingcards;annotated_features;380 0.000 \n", + "3 BrentLab/callingcards;annotated_features;748 0.000 \n", + "4 BrentLab/callingcards;annotated_features;380 0.000 \n", + "5 BrentLab/callingcards;annotated_features;394 0.000 \n", + "6 BrentLab/callingcards;annotated_features;748 0.000 \n", + "7 BrentLab/callingcards;annotated_features;34 0.000 \n", + "8 BrentLab/harbison_2004;harbison_2004;7 0.009 \n", + "9 BrentLab/harbison_2004;harbison_2004;8 0.007 \n", + "\n", + " dataset_id \n", + "0 BrentLab/hackett_2020/hackett_2020 \n", + "1 BrentLab/hackett_2020/hackett_2020 \n", + "2 BrentLab/hackett_2020/hackett_2020 \n", + "3 BrentLab/hackett_2020/hackett_2020 \n", + "4 BrentLab/hackett_2020/hackett_2020 \n", + "5 BrentLab/hackett_2020/hackett_2020 \n", + "6 BrentLab/hackett_2020/hackett_2020 \n", + "7 BrentLab/hackett_2020/hackett_2020 \n", + "8 BrentLab/hackett_2020/hackett_2020 \n", + "9 BrentLab/hackett_2020/hackett_2020 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Even if not specified in the `fields` parameter, the filter options will still be retained within the `fields` parameter.\n", + "all_p001 = vdb.query(\n", + " datasets=[(\"BrentLab/hackett_2020\", \"hackett_2020\")],\n", + " filters={\n", + " \"pvalue\": (\"<=\", 0.01) \n", + " },\n", + " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"perturbation_id\",\"binding_id\"],\n", + ")\n", + "\n", + "all_p001.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a function called query_dto that is specifically responsible for retrieving the DTO data for the specified binding and perturbation datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 126 DTO records\n", + "Column names: ['binding_id', 'perturbation_id', 'pvalue', 'fdr', 'sample_id', 'carbon_source', 'temperature_celsius']\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
binding_idperturbation_idpvaluefdrsample_idcarbon_sourcetemperature_celsius
12BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;850.0040.0002253glucose30
17BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;870.0100.0002253glucose30
18BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;820.0050.0002253glucose30
50BrentLab/harbison_2004;harbison_2004;7BrentLab/Hackett_2020;hackett_2020;440.0090.0224957glucose30
59BrentLab/harbison_2004;harbison_2004;8BrentLab/Hackett_2020;hackett_2020;440.0070.0800578glucose30
61BrentLab/harbison_2004;harbison_2004;8BrentLab/Hackett_2020;hackett_2020;460.0000.0625118glucose30
66BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;440.0060.1106849glucose30
68BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;480.0040.3176109glucose30
71BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;460.0010.1176879glucose30
76BrentLab/harbison_2004;harbison_2004;10BrentLab/Hackett_2020;hackett_2020;460.0030.09927210unspecified30
\n", + "
" + ], + "text/plain": [ + " binding_id \\\n", + "12 BrentLab/harbison_2004;harbison_2004;3 \n", + "17 BrentLab/harbison_2004;harbison_2004;3 \n", + "18 BrentLab/harbison_2004;harbison_2004;3 \n", + "50 BrentLab/harbison_2004;harbison_2004;7 \n", + "59 BrentLab/harbison_2004;harbison_2004;8 \n", + "61 BrentLab/harbison_2004;harbison_2004;8 \n", + "66 BrentLab/harbison_2004;harbison_2004;9 \n", + "68 BrentLab/harbison_2004;harbison_2004;9 \n", + "71 BrentLab/harbison_2004;harbison_2004;9 \n", + "76 BrentLab/harbison_2004;harbison_2004;10 \n", + "\n", + " perturbation_id pvalue fdr sample_id \\\n", + "12 BrentLab/Hackett_2020;hackett_2020;85 0.004 0.000225 3 \n", + "17 BrentLab/Hackett_2020;hackett_2020;87 0.010 0.000225 3 \n", + "18 BrentLab/Hackett_2020;hackett_2020;82 0.005 0.000225 3 \n", + "50 BrentLab/Hackett_2020;hackett_2020;44 0.009 0.022495 7 \n", + "59 BrentLab/Hackett_2020;hackett_2020;44 0.007 0.080057 8 \n", + "61 BrentLab/Hackett_2020;hackett_2020;46 0.000 0.062511 8 \n", + "66 BrentLab/Hackett_2020;hackett_2020;44 0.006 0.110684 9 \n", + "68 BrentLab/Hackett_2020;hackett_2020;48 0.004 0.317610 9 \n", + "71 BrentLab/Hackett_2020;hackett_2020;46 0.001 0.117687 9 \n", + "76 BrentLab/Hackett_2020;hackett_2020;46 0.003 0.099272 10 \n", + "\n", + " carbon_source temperature_celsius \n", + "12 glucose 30 \n", + "17 glucose 30 \n", + "18 glucose 30 \n", + "50 glucose 30 \n", + "59 glucose 30 \n", + "61 glucose 30 \n", + "66 glucose 30 \n", + "68 glucose 30 \n", + "71 glucose 30 \n", + "76 unspecified 30 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example: Query the intersection of harbison and hackett, filter for pvalue <= 0.01\n", + "dto_result = vdb.query_dto(\n", + " binding_dataset=(\"BrentLab/harbison_2004\", \"harbison_2004\"),\n", + " perturbation_dataset=(\"BrentLab/hackett_2020\", \"hackett_2020\"),\n", + " dto_filters={\"pvalue\": (\"<=\", 0.01)},\n", + " fields=[\"binding_id\", \"perturbation_id\", \"pvalue\", \"fdr\",\"sample_id\", \"carbon_source\", \"temperature_celsius\"]\n", + ")\n", + "\n", + "print(f\"Found {len(dto_result)} DTO records\")\n", + "print(f\"Column names: {list(dto_result.columns)}\")\n", + "dto_result.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetching 135 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 135/135 [00:00<00:00, 18401.45it/s]\n", + "Query execution failed: Binder Error: Referenced column \"sample_id\" not found in FROM clause!\n", + "Candidate bindings: \"callingcards_enrichment\", \"target_symbol\"\n", + "\n", + "LINE 1: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n", + " ^\n", + "SQL: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n" + ] + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [] + } + ], + "source": [ + "all_p001 = vdb.query( \n", + " datasets=[(\"BrentLab/callingcards\", \"annotated_features\")],\n", + " complete=False\n", + ")\n", + "all_p001.head()\n", + "print(f\"\u603b\u5171\u6709 {len(all_p001)} \u884c\u6570\u636e\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "query in binding or pert,not compara, use vdb function" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 
3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/tutorials/show_new_functions.ipynb b/docs/tutorials/show_new_functions.ipynb new file mode 100644 index 0000000..efd8da3 --- /dev/null +++ b/docs/tutorials/show_new_functions.ipynb @@ -0,0 +1,960 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b5f1facc", + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from collections import Counter\n", + "\n", + "# Set display options\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_rows', 100)\n", + "pd.set_option('display.width', None)\n", + "\n", + "# Set plot style\n", + "plt.style.use('seaborn-v0_8-whitegrid')\n", + "sns.set_palette('husl')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5452d8e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Configuration file saved at: /tmp/tmp9lavjul7/vdb_config.yaml\n" + ] + } + ], + "source": [ + "# Create VirtualDB configuration\n", + "# This configuration defines how to map the fields of different datasets and how to associate DTO comparative analysis data\n", + "\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "config_yaml = \"\"\"\n", + "repositories:\n", + "  BrentLab/harbison_2004:\n", + "    dataset:\n", + "      harbison_2004:\n", + "        sample_id:\n", + "          field: sample_id\n", + "        carbon_source:\n", + "          field: condition\n", + "          path: media.carbon_source.compound\n", + "        temperature_celsius:\n", + "          field: condition\n", + "          path: temperature_celsius\n", + "          dtype: numeric\n", + "        environmental_condition:\n", + "          field: condition\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        regulator_symbol:\n", + "          field: regulator_symbol\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            dataset: dto\n", + "            via_field: binding_id\n", + "\n", + "  BrentLab/rossi_2021:\n", + "    carbon_source: \n", + "      path: media.carbon_source.compound\n", + "    temperature_celsius: \n", + "      path: temperature_celsius\n", + "    dataset:\n", + "      rossi_2021_af_combined:\n", + "        sample_id: \n", + "          field: sample_id\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        target_locus_tag:\n", + "          field: target_locus_tag\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            dataset: dto\n", + "            via_field: binding_id\n", + "\n", + "  BrentLab/mahendrawada_2025:\n", + "    dataset:\n", + "      reprocessed_diffcontrol_5prime:\n", + "        sample_id:\n", + "          field: sample_id\n", + "        control_source:\n", + "          field: control_source\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        regulator_symbol:\n", + "          field: regulator_symbol\n", + "        environmental_condition:\n", + "          field: condition\n", + "        temperature_celsius:\n", + "          field: condition\n", + "          path: temperature_celsius\n", + "          dtype: numeric\n", + "        media_name:\n", + "          field: condition\n", + "          path: media.name\n", + "        carbon_source:\n", + "          field: condition\n", + "          path: media.carbon_source\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            
dataset: dto\n", + " via_field: binding_id\n", + "\n", + "\n", + " BrentLab/callingcards:\n", + " carbon_source: \n", + " path: media.carbon_source.compound\n", + " temperature_celsius: \n", + " path: temperature_celsius\n", + " dataset:\n", + " annotated_features:\n", + " id:\n", + " field: id\n", + " regulator_locus_tag:\n", + " field: target_locus_tag\n", + " regulator_symbol:\n", + " field: target_symbol\n", + " \n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: binding_id\n", + " \n", + " BrentLab/hackett_2020:\n", + " dataset:\n", + " hackett_2020:\n", + " sample_id:\n", + " field: sample_id\n", + " dtype: numeric\n", + " regulator_locus_tag:\n", + " field: regulator_locus_tag\n", + " temperature_celsius:\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + " cultivation_method:\n", + " path: cultivation_method\n", + " media_name:\n", + " path: media.name\n", + " induction_system:\n", + " field: mechanism\n", + " inducer_compound:\n", + " field: mechanism\n", + " path: definitions.inducer\n", + " nutrient_restriction:\n", + " field: restriction\n", + " log2fc:\n", + " field: log2_shrunken_timecourses\n", + " dtype: numeric\n", + " log2_raw_ratio:\n", + " field: log2_ratio\n", + " dtype: numeric\n", + " time_point:\n", + " field: time\n", + " dtype: numeric\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: perturbation_id\n", + "\n", + " BrentLab/kemmeren_2014:\n", + " dataset:\n", + " kemmeren_2014:\n", + " sample_id:\n", + " field: sample_id\n", + " carbon_source:\n", + " path: media.carbon_source.compound\n", + " temperature_celsius:\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: perturbation_id\n", + "\n", + " BrentLab/yeast_comparative_analysis:\n", + " dataset:\n", + " dto:\n", + " binding_id:\n", + " field: binding_id\n", + " perturbation_id:\n", + " field: perturbation_id\n", + " fdr:\n", + " field: dto_fdr\n", + " dtype: numeric\n", + " pvalue:\n", + " field: dto_empirical_pvalue\n", + " dtype: numeric\n", + " binding_threshold:\n", + " field: binding_rank_threshold\n", + " dtype: numeric\n", + " perturbation_threshold:\n", + " field: perturbation_rank_threshold\n", + " dtype: numeric\n", + " binding_set_size:\n", + " field: binding_set_size\n", + " dtype: numeric\n", + " perturbation_set_size:\n", + " field: perturbation_set_size\n", + " dtype: numeric\n", + "\n", + "factor_aliases:\n", + " carbon_source:\n", + " glucose: [D-glucose, dextrose, glu]\n", + " galactose: [D-galactose, gal]\n", + " raffinose: [D-raffinose]\n", + "\n", + "missing_value_labels:\n", + " carbon_source: \"unspecified\"\n", + "\n", + "description:\n", + " carbon_source: The carbon source provided during growth\n", + " temperature_celsius: Growth temperature in degrees Celsius\n", + " environmental_condition: Named environmental condition\n", + "\"\"\"\n", + "\n", + "# Save the configuration to a temporary file\n", + "temp_config = Path(tempfile.mkdtemp()) / \"vdb_config.yaml\"\n", + "temp_config.write_text(config_yaml)\n", + "\n", + "print(f\"✅ Configuration file saved at: {temp_config}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1550d737", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ VirtualDB initialized 
successfully!\n", + "Number of configured repositories: 7\n", + "\n", + "Configured datasets:\n", + "  - BrentLab/harbison_2004/harbison_2004\n", + "  - BrentLab/rossi_2021/rossi_2021_af_combined\n", + "  - BrentLab/mahendrawada_2025/reprocessed_diffcontrol_5prime\n", + "  - BrentLab/callingcards/annotated_features\n", + "  - BrentLab/hackett_2020/hackett_2020\n", + "  - BrentLab/kemmeren_2014/kemmeren_2014\n", + "  - BrentLab/yeast_comparative_analysis/dto\n" ] } ], "source": [ "# Initialize VirtualDB\n", "import os\n", "from tfbpapi.virtual_db import VirtualDB\n", "\n", "# Token authentication required\n", "hf_token = os.getenv(\"HF_TOKEN\", None)\n", "\n", "vdb = VirtualDB(str(temp_config), token=hf_token)\n", "\n", "print(\"✅ VirtualDB initialized successfully!\")\n", "print(f\"Number of configured repositories: {len(vdb.config.repositories)}\")\n", "\n", "# List all configured datasets\n", "print(\"\\nConfigured datasets:\")\n", "for repo_id, repo_config in vdb.config.repositories.items():\n", "    if repo_config.dataset:\n", "        for config_name in repo_config.dataset.keys():\n", "            print(f\"  - {repo_id}/{config_name}\")\n" ] }, { "cell_type": "markdown", "id": "e6dc4ce8", "metadata": {}, "source": [ "`query` can now include comparative-analysis fields (such as the DTO `pvalue`) directly in its results." ] }, { "cell_type": "code", "execution_count": 5, "id": "c9b1b241", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 41665.27it/s]\n" ] }, { "data": { "text/html": [ "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idcarbon_sourcetemperature_celsiuspvalueperturbation_iddataset_id
01glucose30NaNNaNBrentLab/harbison_2004/harbison_2004
12glucose300.301BrentLab/kemmeren_2014;kemmeren_2014;18BrentLab/harbison_2004/harbison_2004
22glucose30NaNBrentLab/Hackett_2020;hackett_2020;33BrentLab/harbison_2004/harbison_2004
32glucose300.512BrentLab/Hackett_2020;hackett_2020;34BrentLab/harbison_2004/harbison_2004
42glucose300.306BrentLab/Hackett_2020;hackett_2020;40BrentLab/harbison_2004/harbison_2004
52glucose30NaNBrentLab/Hackett_2020;hackett_2020;37BrentLab/harbison_2004/harbison_2004
62glucose300.309BrentLab/Hackett_2020;hackett_2020;38BrentLab/harbison_2004/harbison_2004
72glucose300.644BrentLab/Hackett_2020;hackett_2020;36BrentLab/harbison_2004/harbison_2004
82glucose300.411BrentLab/Hackett_2020;hackett_2020;35BrentLab/harbison_2004/harbison_2004
92glucose300.536BrentLab/Hackett_2020;hackett_2020;39BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id carbon_source temperature_celsius pvalue \\\n", + "0 1 glucose 30 NaN \n", + "1 2 glucose 30 0.301 \n", + "2 2 glucose 30 NaN \n", + "3 2 glucose 30 0.512 \n", + "4 2 glucose 30 0.306 \n", + "5 2 glucose 30 NaN \n", + "6 2 glucose 30 0.309 \n", + "7 2 glucose 30 0.644 \n", + "8 2 glucose 30 0.411 \n", + "9 2 glucose 30 0.536 \n", + "\n", + " perturbation_id \\\n", + "0 NaN \n", + "1 BrentLab/kemmeren_2014;kemmeren_2014;18 \n", + "2 BrentLab/Hackett_2020;hackett_2020;33 \n", + "3 BrentLab/Hackett_2020;hackett_2020;34 \n", + "4 BrentLab/Hackett_2020;hackett_2020;40 \n", + "5 BrentLab/Hackett_2020;hackett_2020;37 \n", + "6 BrentLab/Hackett_2020;hackett_2020;38 \n", + "7 BrentLab/Hackett_2020;hackett_2020;36 \n", + "8 BrentLab/Hackett_2020;hackett_2020;35 \n", + "9 BrentLab/Hackett_2020;hackett_2020;39 \n", + "\n", + " dataset_id \n", + "0 BrentLab/harbison_2004/harbison_2004 \n", + "1 BrentLab/harbison_2004/harbison_2004 \n", + "2 BrentLab/harbison_2004/harbison_2004 \n", + "3 BrentLab/harbison_2004/harbison_2004 \n", + "4 BrentLab/harbison_2004/harbison_2004 \n", + "5 BrentLab/harbison_2004/harbison_2004 \n", + "6 BrentLab/harbison_2004/harbison_2004 \n", + "7 BrentLab/harbison_2004/harbison_2004 \n", + "8 BrentLab/harbison_2004/harbison_2004 \n", + "9 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_p001 = vdb.query(\n", + " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", + " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"pvalue\", \"perturbation_id\"],\n", + ")\n", + "\n", + "all_p001.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f7f6a7f4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 37729.87it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idtemperature_celsiusperturbation_idbinding_idpvaluedataset_id
03430BrentLab/Hackett_2020;hackett_2020;34BrentLab/callingcards;annotated_features;3940.010BrentLab/hackett_2020/hackett_2020
13830BrentLab/Hackett_2020;hackett_2020;38BrentLab/callingcards;annotated_features;3800.003BrentLab/hackett_2020/hackett_2020
23930BrentLab/Hackett_2020;hackett_2020;39BrentLab/callingcards;annotated_features;3800.000BrentLab/hackett_2020/hackett_2020
33930BrentLab/Hackett_2020;hackett_2020;39BrentLab/callingcards;annotated_features;7480.000BrentLab/hackett_2020/hackett_2020
44030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;3800.000BrentLab/hackett_2020/hackett_2020
54030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;3940.000BrentLab/hackett_2020/hackett_2020
64030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;7480.000BrentLab/hackett_2020/hackett_2020
74430BrentLab/Hackett_2020;hackett_2020;44BrentLab/callingcards;annotated_features;340.000BrentLab/hackett_2020/hackett_2020
84430BrentLab/Hackett_2020;hackett_2020;44BrentLab/harbison_2004;harbison_2004;70.009BrentLab/hackett_2020/hackett_2020
94430BrentLab/Hackett_2020;hackett_2020;44BrentLab/harbison_2004;harbison_2004;80.007BrentLab/hackett_2020/hackett_2020
\n", + "
" + ], + "text/plain": [ + " sample_id temperature_celsius perturbation_id \\\n", + "0 34 30 BrentLab/Hackett_2020;hackett_2020;34 \n", + "1 38 30 BrentLab/Hackett_2020;hackett_2020;38 \n", + "2 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n", + "3 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n", + "4 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "5 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "6 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "7 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "8 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "9 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "\n", + " binding_id pvalue \\\n", + "0 BrentLab/callingcards;annotated_features;394 0.010 \n", + "1 BrentLab/callingcards;annotated_features;380 0.003 \n", + "2 BrentLab/callingcards;annotated_features;380 0.000 \n", + "3 BrentLab/callingcards;annotated_features;748 0.000 \n", + "4 BrentLab/callingcards;annotated_features;380 0.000 \n", + "5 BrentLab/callingcards;annotated_features;394 0.000 \n", + "6 BrentLab/callingcards;annotated_features;748 0.000 \n", + "7 BrentLab/callingcards;annotated_features;34 0.000 \n", + "8 BrentLab/harbison_2004;harbison_2004;7 0.009 \n", + "9 BrentLab/harbison_2004;harbison_2004;8 0.007 \n", + "\n", + " dataset_id \n", + "0 BrentLab/hackett_2020/hackett_2020 \n", + "1 BrentLab/hackett_2020/hackett_2020 \n", + "2 BrentLab/hackett_2020/hackett_2020 \n", + "3 BrentLab/hackett_2020/hackett_2020 \n", + "4 BrentLab/hackett_2020/hackett_2020 \n", + "5 BrentLab/hackett_2020/hackett_2020 \n", + "6 BrentLab/hackett_2020/hackett_2020 \n", + "7 BrentLab/hackett_2020/hackett_2020 \n", + "8 BrentLab/hackett_2020/hackett_2020 \n", + "9 BrentLab/hackett_2020/hackett_2020 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Even if not specified in the `fields` parameter, the filter options will still be retained within the `fields` parameter.\n", + "all_p001 = vdb.query(\n", + " datasets=[(\"BrentLab/hackett_2020\", \"hackett_2020\")],\n", + " filters={\n", + " \"pvalue\": (\"<=\", 0.01) \n", + " },\n", + " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"perturbation_id\",\"binding_id\"],\n", + ")\n", + "\n", + "all_p001.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "5bd97850", + "metadata": {}, + "source": [ + "Create a function called query_dto that is specifically responsible for retrieving the DTO data for the specified binding and perturbation datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "20864108", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 126 DTO records\n", + "Column names: ['binding_id', 'perturbation_id', 'pvalue', 'fdr', 'sample_id', 'carbon_source', 'temperature_celsius']\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
binding_idperturbation_idpvaluefdrsample_idcarbon_sourcetemperature_celsius
12BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;850.0040.0002253glucose30
17BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;870.0100.0002253glucose30
18BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;820.0050.0002253glucose30
50BrentLab/harbison_2004;harbison_2004;7BrentLab/Hackett_2020;hackett_2020;440.0090.0224957glucose30
59BrentLab/harbison_2004;harbison_2004;8BrentLab/Hackett_2020;hackett_2020;440.0070.0800578glucose30
61BrentLab/harbison_2004;harbison_2004;8BrentLab/Hackett_2020;hackett_2020;460.0000.0625118glucose30
66BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;440.0060.1106849glucose30
68BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;480.0040.3176109glucose30
71BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;460.0010.1176879glucose30
76BrentLab/harbison_2004;harbison_2004;10BrentLab/Hackett_2020;hackett_2020;460.0030.09927210unspecified30
\n", + "
" + ], + "text/plain": [ + " binding_id \\\n", + "12 BrentLab/harbison_2004;harbison_2004;3 \n", + "17 BrentLab/harbison_2004;harbison_2004;3 \n", + "18 BrentLab/harbison_2004;harbison_2004;3 \n", + "50 BrentLab/harbison_2004;harbison_2004;7 \n", + "59 BrentLab/harbison_2004;harbison_2004;8 \n", + "61 BrentLab/harbison_2004;harbison_2004;8 \n", + "66 BrentLab/harbison_2004;harbison_2004;9 \n", + "68 BrentLab/harbison_2004;harbison_2004;9 \n", + "71 BrentLab/harbison_2004;harbison_2004;9 \n", + "76 BrentLab/harbison_2004;harbison_2004;10 \n", + "\n", + " perturbation_id pvalue fdr sample_id \\\n", + "12 BrentLab/Hackett_2020;hackett_2020;85 0.004 0.000225 3 \n", + "17 BrentLab/Hackett_2020;hackett_2020;87 0.010 0.000225 3 \n", + "18 BrentLab/Hackett_2020;hackett_2020;82 0.005 0.000225 3 \n", + "50 BrentLab/Hackett_2020;hackett_2020;44 0.009 0.022495 7 \n", + "59 BrentLab/Hackett_2020;hackett_2020;44 0.007 0.080057 8 \n", + "61 BrentLab/Hackett_2020;hackett_2020;46 0.000 0.062511 8 \n", + "66 BrentLab/Hackett_2020;hackett_2020;44 0.006 0.110684 9 \n", + "68 BrentLab/Hackett_2020;hackett_2020;48 0.004 0.317610 9 \n", + "71 BrentLab/Hackett_2020;hackett_2020;46 0.001 0.117687 9 \n", + "76 BrentLab/Hackett_2020;hackett_2020;46 0.003 0.099272 10 \n", + "\n", + " carbon_source temperature_celsius \n", + "12 glucose 30 \n", + "17 glucose 30 \n", + "18 glucose 30 \n", + "50 glucose 30 \n", + "59 glucose 30 \n", + "61 glucose 30 \n", + "66 glucose 30 \n", + "68 glucose 30 \n", + "71 glucose 30 \n", + "76 unspecified 30 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example: Query the intersection of harbison and hackett, filter for pvalue <= 0.01\n", + "dto_result = vdb.query_dto(\n", + " binding_dataset=(\"BrentLab/harbison_2004\", \"harbison_2004\"),\n", + " perturbation_dataset=(\"BrentLab/hackett_2020\", \"hackett_2020\"),\n", + " dto_filters={\"pvalue\": (\"<=\", 0.01)},\n", + " fields=[\"binding_id\", \"perturbation_id\", \"pvalue\", \"fdr\",\"sample_id\", \"carbon_source\", \"temperature_celsius\"]\n", + ")\n", + "\n", + "print(f\"Found {len(dto_result)} DTO records\")\n", + "print(f\"Column names: {list(dto_result.columns)}\")\n", + "dto_result.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "15f63f8a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Query execution failed: Binder Error: Referenced column \"sample_id\" not found in FROM clause!\n", + "Candidate bindings: \"callingcards_enrichment\", \"target_symbol\"\n", + "\n", + "LINE 1: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n", + " ^\n", + "SQL: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m all_p001 = \u001b[43mvdb\u001b[49m\u001b[43m.\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\n\u001b[32m 2\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mdatasets\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mBrentLab/callingcards\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mannotated_features\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mcomplete\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[32m 4\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 5\u001b[39m all_p001.head()\n\u001b[32m 6\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mTotal number of rows: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(all_p001)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/tfbpapi/virtual_db.py:406\u001b[39m, in \u001b[36mVirtualDB.query\u001b[39m\u001b[34m(self, filters, datasets, fields, complete)\u001b[39m\n\u001b[32m 403\u001b[39m results: \u001b[38;5;28mlist\u001b[39m[pd.DataFrame] = []\n\u001b[32m 404\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m repo_id, config_name \u001b[38;5;129;01min\u001b[39;00m datasets:\n\u001b[32m 405\u001b[39m \u001b[38;5;66;03m# Build metadata table\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m406\u001b[39m metadata_df = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_build_metadata_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 407\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m metadata_df.empty:\n\u001b[32m 408\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/tfbpapi/virtual_db.py:690\u001b[39m, in \u001b[36mVirtualDB._build_metadata_table\u001b[39m\u001b[34m(self, repo_id, config_name, use_cache)\u001b[39m\n\u001b[32m 688\u001b[39m \u001b[38;5;66;03m# If sample_id doesn't exist, generate from row number\u001b[39;00m\n\u001b[32m 689\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33msample_id\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m df.columns \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m df.empty:\n\u001b[32m--> \u001b[39m\u001b[32m690\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33msample_id\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m.\u001b[49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 692\u001b[39m \u001b[38;5;66;03m# One row per sample_id\u001b[39;00m\n\u001b[32m 693\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33msample_id\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m df.columns:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/.venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:1104\u001b[39m, in \u001b[36mIndex.astype\u001b[39m\u001b[34m(self, dtype, copy)\u001b[39m\n\u001b[32m 1100\u001b[39m new_values = \u001b[38;5;28mcls\u001b[39m._from_sequence(\u001b[38;5;28mself\u001b[39m, dtype=dtype, copy=copy)\n\u001b[32m 1102\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1103\u001b[39m \u001b[38;5;66;03m# GH#13149 specifically use astype_array instead of 
astype\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1104\u001b[39m new_values = \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1106\u001b[39m \u001b[38;5;66;03m# pass copy=False because any copying will be done in the astype above\u001b[39;00m\n\u001b[32m 1107\u001b[39m result = Index(new_values, name=\u001b[38;5;28mself\u001b[39m.name, dtype=new_values.dtype, copy=\u001b[38;5;28;01mFalse\u001b[39;00m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:182\u001b[39m, in \u001b[36mastype_array\u001b[39m\u001b[34m(values, dtype, copy)\u001b[39m\n\u001b[32m 179\u001b[39m values = values.astype(dtype, copy=copy)\n\u001b[32m 181\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m182\u001b[39m values = \u001b[43m_astype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 184\u001b[39m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[32m 185\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np.dtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values.dtype.type, \u001b[38;5;28mstr\u001b[39m):\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:96\u001b[39m, in \u001b[36m_astype_nansafe\u001b[39m\u001b[34m(arr, dtype, copy, skipna)\u001b[39m\n\u001b[32m 94\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m arr.ndim > \u001b[32m1\u001b[39m:\n\u001b[32m 95\u001b[39m arr = arr.ravel()\n\u001b[32m---> \u001b[39m\u001b[32m96\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[43m.\u001b[49m\u001b[43mensure_string_array\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 97\u001b[39m \u001b[43m \u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m=\u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert_na_value\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[32m 98\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m.reshape(shape)\n\u001b[32m 100\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m np.issubdtype(arr.dtype, np.floating) \u001b[38;5;129;01mand\u001b[39;00m dtype.kind \u001b[38;5;129;01min\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33miu\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m _astype_float_to_int_nansafe(arr, dtype, copy)\n", + "\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/lib.pyx:718\u001b[39m, in \u001b[36mpandas._libs.lib.ensure_string_array\u001b[39m\u001b[34m()\u001b[39m\n", + "\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/lib.pyx:832\u001b[39m, in \u001b[36mpandas._libs.lib.ensure_string_array\u001b[39m\u001b[34m()\u001b[39m\n", + "\u001b[31mKeyboardInterrupt\u001b[39m: " + ] + } + ], + "source": [ + "all_p001 = vdb.query( \n", + " datasets=[(\"BrentLab/callingcards\", \"annotated_features\")],\n", 
+ " complete=False\n", + ")\n", + "all_p001.head()\n", + "print(f\"Total number of rows: {len(all_p001)}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py index 1293bf9..f0abf1c 100644 --- a/tfbpapi/tests/test_virtual_db.py +++ b/tfbpapi/tests/test_virtual_db.py @@ -509,185 +509,6 @@ def test_parse_composite_identifier_invalid(self): with pytest.raises(ValueError, match="Invalid composite ID format"): VirtualDB._parse_composite_identifier("invalid:format") - def test_get_comparative_fields_for_dataset(self): - """Test getting comparative fields mapping.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary": { - "dataset": { - "primary_data": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comparative", - "dataset": "comp_data", - "via_field": "binding_id", - } - ], - } - } - }, - "BrentLab/comparative": { - "dataset": { - "comp_data": { - "dto_fdr": {"field": "dto_fdr"}, - "dto_pvalue": {"field": "dto_empirical_pvalue"}, - } - } - }, - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - field_mapping = vdb._get_comparative_fields_for_dataset( - "BrentLab/primary", "primary_data" - ) - - # Should have dto_fdr and dto_pvalue, but NOT binding_id (via_field) - assert "dto_fdr" in field_mapping - assert "dto_pvalue" in field_mapping - assert "binding_id" not in field_mapping - - # Check mapping structure - assert field_mapping["dto_fdr"]["comp_repo"] == "BrentLab/comparative" - assert field_mapping["dto_fdr"]["comp_dataset"] == "comp_data" - assert field_mapping["dto_fdr"]["via_field"] == "binding_id" - finally: - Path(config_path).unlink() - - def test_get_comparative_fields_no_links(self): - """Test that datasets without comparative links return empty mapping.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary": { - "dataset": { - "primary_data": {"sample_id": {"field": "sample_id"}} - } - } - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - field_mapping = vdb._get_comparative_fields_for_dataset( - "BrentLab/primary", "primary_data" - ) - assert field_mapping == {} - finally: - Path(config_path).unlink() - - def test_get_comparative_analyses(self): - """Test getting comparative analysis relationships.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary": { - "dataset": { - "primary_data": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comparative", - "dataset": "comp_data", - "via_field": "binding_id", - } - ], - } - } - }, - "BrentLab/comparative": { - "dataset": {"comp_data": {"dto_fdr": {"field": "dto_fdr"}}} - }, - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - info = vdb.get_comparative_analyses() - - # Check primary to comparative mapping - assert 
"BrentLab/primary/primary_data" in info["primary_to_comparative"] - links = info["primary_to_comparative"]["BrentLab/primary/primary_data"] - assert len(links) == 1 - assert links[0]["comparative_repo"] == "BrentLab/comparative" - assert links[0]["comparative_dataset"] == "comp_data" - assert links[0]["via_field"] == "binding_id" - - # Check comparative fields - assert "BrentLab/comparative/comp_data" in info["comparative_fields"] - assert ( - "dto_fdr" - in info["comparative_fields"]["BrentLab/comparative/comp_data"] - ) - finally: - Path(config_path).unlink() - - def test_get_comparative_analyses_filtered(self): - """Test filtering comparative analyses by repo and config.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary1": { - "dataset": { - "data1": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comp", - "dataset": "comp_data", - "via_field": "id1", - } - ], - } - } - }, - "BrentLab/primary2": { - "dataset": { - "data2": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comp", - "dataset": "comp_data", - "via_field": "id2", - } - ], - } - } - }, - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - - # Get all - all_info = vdb.get_comparative_analyses() - assert len(all_info["primary_to_comparative"]) == 2 - - # Filter by repo and config - filtered = vdb.get_comparative_analyses("BrentLab/primary1", "data1") - assert len(filtered["primary_to_comparative"]) == 1 - assert "BrentLab/primary1/data1" in filtered["primary_to_comparative"] - - # Filter by repo only - repo_filtered = vdb.get_comparative_analyses("BrentLab/primary2") - assert len(repo_filtered["primary_to_comparative"]) == 1 - assert "BrentLab/primary2/data2" in repo_filtered["primary_to_comparative"] - finally: - Path(config_path).unlink() - # Note: Full integration tests with real HuggingFace datasets would go here # but are excluded as they require network access and specific test datasets. diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py index f6dd12e..80992b5 100644 --- a/tfbpapi/virtual_db.py +++ b/tfbpapi/virtual_db.py @@ -64,9 +64,15 @@ def get_nested_value(data: dict, path: str) -> Any: List of dicts - extract property from each item: get_nested_value( - {"media": {"carbon_source": [{"compound": "glucose"}, - {"compound": "galactose"}]}}, - "media.carbon_source.compound" + { + "media": { + "carbon_source": [ + {"compound": "glucose"}, + {"compound": "galactose"}, + ] + } + }, + "media.carbon_source.compound", ) Returns: ["glucose", "galactose"] @@ -193,8 +199,6 @@ def __init__(self, config_path: Path | str, token: str | None = None): self.config = MetadataConfig.from_yaml(config_path) self.token = token self.cache: dict[tuple[str, str], pd.DataFrame] = {} - # Build mapping of comparative dataset references - self._comparative_links = self._build_comparative_links() def get_fields( self, repo_id: str | None = None, config_name: str | None = None @@ -202,6 +206,8 @@ def get_fields( """ Get list of queryable fields. + Includes fields from comparative analyses if configured. 
+ :param repo_id: Optional repository ID to filter to specific dataset :param config_name: Optional config name (required if repo_id provided) :return: List of field names @@ -217,7 +223,23 @@ def get_fields( if repo_id is not None and config_name is not None: # Get fields for specific dataset mappings = self.config.get_property_mappings(repo_id, config_name) - return sorted(mappings.keys()) + fields = set(mappings.keys()) + + # Add fields from comparative analyses + repo_config = self.config.get_repository_config(repo_id) + if repo_config and repo_config.dataset: + dataset_config = repo_config.dataset.get(config_name) + if dataset_config and dataset_config.comparative_analyses: + for comp_analysis in dataset_config.comparative_analyses: + comp_mappings = self.config.get_property_mappings( + comp_analysis.repo, comp_analysis.dataset + ) + # Add comparative fields (exclude via_field) + for field in comp_mappings.keys(): + if field != comp_analysis.via_field: + fields.add(field) + + return sorted(fields) if repo_id is not None or config_name is not None: raise ValueError( @@ -231,16 +253,21 @@ def get_fields( all_fields.update(repo_config.properties.keys()) # Add dataset-specific fields if repo_config.dataset: - for dataset_config in repo_config.dataset.values(): - # DatasetVirtualDBConfig stores property mappings in model_extra - if ( - hasattr(dataset_config, "model_extra") - and dataset_config.model_extra - ): - all_fields.update(dataset_config.model_extra.keys()) - # Also include special fields if they exist - if dataset_config.sample_id: - all_fields.add("sample_id") + for config_name, dataset_config in repo_config.dataset.items(): + # Get property mappings (excludes comparative_analyses) + mappings = self.config.get_property_mappings(repo_id, config_name) + all_fields.update(mappings.keys()) + + # Add fields from comparative analyses + if dataset_config.comparative_analyses: + for comp_analysis in dataset_config.comparative_analyses: + comp_mappings = self.config.get_property_mappings( + comp_analysis.repo, comp_analysis.dataset + ) + # Add comparative fields (exclude via_field) + for field in comp_mappings.keys(): + if field != comp_analysis.via_field: + all_fields.add(field) return sorted(all_fields) @@ -326,101 +353,6 @@ def get_unique_values( else: return sorted(all_values) - def get_comparative_analyses( - self, repo_id: str | None = None, config_name: str | None = None - ) -> dict[str, Any]: - """ - Get information about comparative analysis relationships. - - Returns information about which comparative datasets are available - and how they link to primary datasets. Useful for discovering - what cross-dataset analyses can be performed. 
- - :param repo_id: Optional repository ID to filter to specific repo - :param config_name: Optional config name (requires repo_id) - :return: Dictionary with two keys: - - "primary_to_comparative": Maps primary datasets to their - comparative analyses - - "comparative_fields": Maps comparative datasets to fields - available for joining - :raises ValueError: If config_name provided without repo_id - - Examples: - Get all comparative analysis relationships: - info = vdb.get_comparative_analyses() - - Get relationships for specific primary dataset: - info = vdb.get_comparative_analyses( - "BrentLab/callingcards", "annotated_features" - ) - - """ - if config_name and not repo_id: - raise ValueError("repo_id required when config_name is specified") - - primary_to_comparative: dict[str, list[dict[str, str]]] = {} - comparative_fields: dict[str, list[str]] = {} - - # Filter links based on parameters - if repo_id and config_name: - # Specific dataset requested - links_to_process = { - (repo_id, config_name): self._comparative_links.get( - (repo_id, config_name), {} - ) - } - elif repo_id: - # All configs in specific repo - links_to_process = { - k: v for k, v in self._comparative_links.items() if k[0] == repo_id - } - else: - # All links - links_to_process = self._comparative_links - - # Build primary to comparative mapping - for (prim_repo, prim_config), link_info in links_to_process.items(): - if "comparative_analyses" not in link_info: - continue - - dataset_key = f"{prim_repo}/{prim_config}" - primary_to_comparative[dataset_key] = [] - - for ca in link_info["comparative_analyses"]: - primary_to_comparative[dataset_key].append( - { - "comparative_repo": ca["repo"], - "comparative_dataset": ca["dataset"], - "via_field": ca["via_field"], - } - ) - - # Track which fields are available from comparative datasets - comp_key = f"{ca['repo']}/{ca['dataset']}" - if comp_key not in comparative_fields: - # Get fields from the comparative dataset - # First try config mappings - comp_fields = self.get_fields(ca["repo"], ca["dataset"]) - - # If no mappings, get actual fields from DataCard - if not comp_fields: - try: - card = DataCard(ca["repo"], token=self.token) - config = card.get_config(ca["dataset"]) - if config and config.dataset_info: - comp_fields = [ - f.name for f in config.dataset_info.features - ] - except Exception: - comp_fields = [] - - comparative_fields[comp_key] = comp_fields - - return { - "primary_to_comparative": primary_to_comparative, - "comparative_fields": comparative_fields, - } - def query( self, filters: dict[str, Any] | None = None, @@ -475,59 +407,13 @@ def query( if metadata_df.empty: continue - # Separate filters into primary and comparative - primary_filters = {} - comparative_filters = {} + # Apply filters if filters: - # Get comparative field mapping - comp_field_mapping = self._get_comparative_fields_for_dataset( - repo_id, config_name - ) - for field, value in filters.items(): - if field in comp_field_mapping: - comparative_filters[field] = value - else: - primary_filters[field] = value - - # Apply primary filters first - if primary_filters: - metadata_df = self._apply_filters( - metadata_df, primary_filters, repo_id, config_name - ) - - # Enrich with comparative data if needed - # IMPORTANT: Do this BEFORE getting complete data so comparative fields - # are joined at the sample level, not measurement level - # This happens when: fields are requested from comparative datasets - # OR when filtering on comparative fields - if fields or comparative_filters: - 
comp_field_mapping = self._get_comparative_fields_for_dataset( - repo_id, config_name - ) - if fields: - requested_comp_fields = [ - f for f in fields if f in comp_field_mapping - ] - # Also need fields that are filtered on - filtered_comp_fields = [ - f for f in comparative_filters.keys() if f in comp_field_mapping - ] - all_comp_fields = list( - set(requested_comp_fields + filtered_comp_fields) - ) - if all_comp_fields: - metadata_df = self._enrich_with_comparative_data( - metadata_df, repo_id, config_name, all_comp_fields - ) - - # Apply comparative filters after enrichment - if comparative_filters: metadata_df = self._apply_filters( - metadata_df, comparative_filters, repo_id, config_name + metadata_df, filters, repo_id, config_name ) # If complete=True, join with full data - # Do this AFTER comparative enrichment so DTO fields are already added if complete: sample_ids = metadata_df["sample_id"].tolist() if sample_ids: @@ -547,9 +433,21 @@ def query( for field in fields: if field in metadata_df.columns and field not in keep_cols: keep_cols.append(field) + + # IMPORTANT: Also include fields used in filters + # This ensures that filtered fields are always returned, + # even if not in fields parameter + if filters: + for filter_field in filters.keys(): + if ( + filter_field in metadata_df.columns + and filter_field not in keep_cols + ): + keep_cols.append(filter_field) + metadata_df = metadata_df[keep_cols].copy() - # Add dataset identifier + # Add dataset identifier (ensure copy before modifying) if "dataset_id" not in metadata_df.columns: metadata_df = metadata_df.copy() metadata_df["dataset_id"] = f"{repo_id}/{config_name}" @@ -562,6 +460,117 @@ def query( # Concatenate results, filling NaN for missing columns return pd.concat(results, ignore_index=True, sort=False) + def query_dto( + self, + binding_dataset: tuple[str, str], + perturbation_dataset: tuple[str, str], + binding_filters: dict[str, Any] | None = None, + perturbation_filters: dict[str, Any] | None = None, + dto_filters: dict[str, Any] | None = None, + fields: list[str] | None = None, + ) -> pd.DataFrame: + """ + Query dto data filtered by binding and perturbation datasets. + + This method uses the existing query() function to get binding dataset data + (which automatically includes DTO fields via comparative_analyses join), + then filters by perturbation_id using pandas. 
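+
+        Matching uses composite identifiers of the form
+        "repo_id;config_name;sample_id"; a capitalized variant of the repo
+        name (e.g., hackett_2020 -> Hackett_2020) is also tried (Step 3).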
+
+        :param binding_dataset: (repo_id, config_name) for binding_id source
+        :param perturbation_dataset: (repo_id, config_name) for perturbation_id source
+        :param binding_filters: Filters to apply on binding dataset
+        :param perturbation_filters: Filters to apply on perturbation dataset
+        :param dto_filters: Filters on DTO fields,
+            e.g. {"pvalue": ("<=", 0.01)}
+        :param fields: Fields to return (None = all fields including DTO fields)
+        :return: DataFrame with matching DTO records
+
+        Examples:
+            # Basic usage: query DTO intersection
+            result = vdb.query_dto(
+                binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
+                perturbation_dataset=("BrentLab/hackett_2020", "hackett_2020"),
+                dto_filters={"pvalue": ("<=", 0.01)}
+            )
+
+            # With source dataset filters
+            result = vdb.query_dto(
+                binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
+                perturbation_dataset=("BrentLab/hackett_2020", "hackett_2020"),
+                binding_filters={"carbon_source": "glucose"},
+                perturbation_filters={"temperature_celsius": 30},
+                dto_filters={"pvalue": ("<=", 0.01), "fdr": ("<=", 0.05)},
+                fields=["sample_id", "binding_id", "perturbation_id", "pvalue", "fdr"]
+            )
+
+        """
+        # Step 1: Query binding dataset (DTO fields are joined automatically)
+        binding_df = self.query(
+            datasets=[binding_dataset],
+            filters=binding_filters,
+            fields=fields,  # query() selects the requested fields if given
+        )
+
+        if binding_df.empty:
+            return pd.DataFrame()
+
+        # Check if perturbation_id column exists (from DTO join)
+        if "perturbation_id" not in binding_df.columns:
+            # No DTO data joined, return empty
+            return pd.DataFrame()
+
+        # Step 2: Query perturbation dataset to get sample_ids
+        perturbation_repo, perturbation_config = perturbation_dataset
+        perturbation_df = self.query(
+            datasets=[perturbation_dataset],
+            filters=perturbation_filters,
+            fields=["sample_id"],
+        )
+
+        if perturbation_df.empty or "sample_id" not in perturbation_df.columns:
+            return pd.DataFrame()
+
+        # Step 3: Build composite IDs for perturbation dataset (with case variants)
+        perturbation_ids = set()
+        for sample_id in perturbation_df["sample_id"].astype(str).unique():
+            # Original format
+            perturbation_ids.add(
+                f"{perturbation_repo};{perturbation_config};{sample_id}"
+            )
+            # Capitalized variant (e.g., hackett_2020 -> Hackett_2020)
+            if "/" in perturbation_repo:
+                parts = perturbation_repo.split("/", 1)
+                if len(parts) == 2 and parts[1]:
+                    alt_repo = f"{parts[0]}/{parts[1][0].upper()}{parts[1][1:]}"
+                    perturbation_ids.add(
+                        f"{alt_repo};{perturbation_config};{sample_id}"
+                    )
+
+        # Step 4: Keep only binding rows whose perturbation_id matches
+        # (rows with NaN perturbation_id are dropped, since isin() is False there)
+        result_df = binding_df[
+            binding_df["perturbation_id"].isin(perturbation_ids)
+        ].copy()
+
+        if result_df.empty:
+            return pd.DataFrame()
+
+        # Step 5: Apply DTO filters if provided
+        if dto_filters:
+            # Get binding dataset info for filter application
+            binding_repo, binding_config = binding_dataset
+            result_df = self._apply_filters(
+                result_df, dto_filters, binding_repo, binding_config
+            )
+
+        # Step 6: Select requested fields if specified
+        if fields:
+            available_fields = [f for f in fields if f in result_df.columns]
+            if available_fields:
+                result_df = result_df[available_fields].copy()
+
+        return result_df
+
     def materialize_views(self, datasets: list[tuple[str, str]] | None = None) -> None:
         """
         Build and cache metadata DataFrames for faster subsequent queries.
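For orientation between hunks: a minimal sketch of how `query_dto` supports the notebook's intersection analysis. It assumes `vdb = VirtualDB(config_path)` as configured earlier; the Kemmeren repo/config names and the `pvalue`/`regulator_symbol` field names are assumptions for illustration (taken from the docstring examples and the analysis objectives), not verified identifiers.

```python
# Sketch: binding samples with DTO p <= 0.01 against BOTH perturbation
# datasets, then per-regulator counts on the intersection.
# The Kemmeren repo/config names below are hypothetical.
hackett = vdb.query_dto(
    binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
    perturbation_dataset=("BrentLab/hackett_2020", "hackett_2020"),
    dto_filters={"pvalue": ("<=", 0.01)},
    fields=["sample_id", "regulator_symbol", "pvalue"],
)
kemmeren = vdb.query_dto(
    binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
    perturbation_dataset=("BrentLab/kemmeren_2014", "kemmeren_2014_tfko"),
    dto_filters={"pvalue": ("<=", 0.01)},
    fields=["sample_id", "regulator_symbol", "pvalue"],
)

# Intersection: binding samples significant against both datasets
active = hackett[hackett["sample_id"].isin(kemmeren["sample_id"])]
# Number of active samples per regulator
counts = active.groupby("regulator_symbol")["sample_id"].nunique()
print(counts.sort_values(ascending=False))
```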
@@ -605,296 +614,6 @@ def invalidate_cache(self, datasets: list[tuple[str, str]] | None = None) -> Non if dataset_key in self.cache: del self.cache[dataset_key] - def _build_comparative_links(self) -> dict[tuple[str, str], dict[str, Any]]: - """ - Build mapping of primary datasets to their comparative dataset references. - - Returns dict keyed by (repo_id, config_name) with value being dict: { - "comparative_analyses": [ { "repo": comparative_repo_id, - "dataset": comparative_config_name, "via_field": - field_name_with_composite_ids } ] } - - """ - links: dict[tuple[str, str], dict[str, Any]] = {} - - for repo_id, repo_config in self.config.repositories.items(): - if not repo_config.dataset: - continue - - for config_name, dataset_config in repo_config.dataset.items(): - if dataset_config.comparative_analyses: - links[(repo_id, config_name)] = { - "comparative_analyses": [ - { - "repo": ca.repo, - "dataset": ca.dataset, - "via_field": ca.via_field, - } - for ca in dataset_config.comparative_analyses - ] - } - - return links - - def _get_comparative_fields_for_dataset( - self, repo_id: str, config_name: str - ) -> dict[str, dict[str, str]]: - """ - Get mapping of comparative fields available for a primary dataset. - - :param repo_id: Primary dataset repository ID - :param config_name: Primary dataset config name - :return: Dict mapping field_name to comparative dataset info - {field_name: { - "comp_repo": comparative_repo_id, - "comp_dataset": comparative_dataset_name, - "via_field": field_with_composite_ids - }} - - Example: - For callingcards dataset linked to DTO via binding_id: - { - "dto_fdr": { - "comp_repo": "BrentLab/yeast_comparative_analysis", - "comp_dataset": "dto", - "via_field": "binding_id" - }, - "dto_empirical_pvalue": {...} - } - - """ - field_mapping: dict[str, dict[str, str]] = {} - - # Get comparative analyses for this dataset - links = self._comparative_links.get((repo_id, config_name), {}) - if "comparative_analyses" not in links: - return field_mapping - - # For each comparative dataset, get its fields - for ca in links["comparative_analyses"]: - comp_repo = ca["repo"] - comp_dataset = ca["dataset"] - via_field = ca["via_field"] - - # Get fields from comparative dataset - comp_fields = self.get_fields(comp_repo, comp_dataset) - - # If no fields from config, try DataCard - if not comp_fields: - try: - from tfbpapi.datacard import DataCard - - card = DataCard(comp_repo, token=self.token) - config = card.get_config(comp_dataset) - if config and config.dataset_info: - comp_fields = [f.name for f in config.dataset_info.features] - except Exception: - comp_fields = [] - - # Map each field to this comparative dataset - for field_name in comp_fields: - # Skip the via_field itself (it's the join key) - if field_name == via_field: - continue - - field_mapping[field_name] = { - "comp_repo": comp_repo, - "comp_dataset": comp_dataset, - "via_field": via_field, - } - - return field_mapping - - def _enrich_with_comparative_data( - self, - primary_df: pd.DataFrame, - repo_id: str, - config_name: str, - requested_fields: list[str], - ) -> pd.DataFrame: - """ - Enrich primary dataset with fields from comparative datasets. 
- - :param primary_df: Primary dataset DataFrame with sample_id column - :param repo_id: Primary dataset repository ID - :param config_name: Primary dataset config name - :param requested_fields: List of field names requested by user - :return: DataFrame enriched with comparative fields - - """ - # Get mapping of which fields come from which comparative datasets - comp_field_mapping = self._get_comparative_fields_for_dataset( - repo_id, config_name - ) - - if not comp_field_mapping: - return primary_df - - # Find which requested fields are from comparative datasets - comp_fields_to_fetch = [f for f in requested_fields if f in comp_field_mapping] - - if not comp_fields_to_fetch: - return primary_df - - # Group fields by comparative dataset to minimize queries - by_comp_dataset: dict[tuple[str, str, str], list[str]] = {} - for field in comp_fields_to_fetch: - info = comp_field_mapping[field] - key = (info["comp_repo"], info["comp_dataset"], info["via_field"]) - if key not in by_comp_dataset: - by_comp_dataset[key] = [] - by_comp_dataset[key].append(field) - - # For each comparative dataset, load and join - result_df = primary_df.copy() - - for (comp_repo, comp_dataset, via_field), fields in by_comp_dataset.items(): - try: - # Load comparative dataset using HfCacheManager - # but query the raw data table instead of metadata view - from tfbpapi.hf_cache_manager import HfCacheManager - - comp_cache_mgr = HfCacheManager( - comp_repo, duckdb_conn=duckdb.connect(":memory:"), token=self.token - ) - - # Get the config to load data - comp_config = comp_cache_mgr.get_config(comp_dataset) - if not comp_config: - continue - - # Load the data (this will download and register parquet files) - result = comp_cache_mgr._get_metadata_for_config(comp_config) - if not result.get("success", False): - continue - - # Now query the raw data table directly (not the metadata view) - # The raw table name is config_name without "metadata_" prefix - select_fields = [via_field] + fields - columns = ", ".join(select_fields) - - # Query the actual parquet data by creating a view from the files - try: - # Get file paths that were loaded - import glob - - from huggingface_hub import snapshot_download - - cache_dir = snapshot_download( - repo_id=comp_repo, - repo_type="dataset", - allow_patterns=f"{comp_dataset}/**/*.parquet", - token=self.token, - ) - - parquet_files = glob.glob( - f"{cache_dir}/{comp_dataset}/**/*.parquet", recursive=True - ) - - if not parquet_files: - continue - - # Create a temporary view from parquet files - temp_view = f"temp_{comp_dataset}_raw" - files_sql = ", ".join([f"'{f}'" for f in parquet_files]) - comp_cache_mgr.duckdb_conn.execute( - f"CREATE OR REPLACE VIEW {temp_view} AS " - f"SELECT * FROM read_parquet([{files_sql}])" - ) - - # Query the view - sql = f"SELECT {columns} FROM {temp_view}" - comp_df = comp_cache_mgr.duckdb_conn.execute(sql).fetchdf() - - except Exception: - # If direct parquet loading fails, skip this comparative dataset - continue - - if comp_df.empty: - continue - - # Parse composite identifiers to extract sample_id - # via_field contains values like - # "BrentLab/harbison_2004;harbison_2004;123" - # We need to extract the third component and match on - # current repo/config - def extract_sample_id(composite_id: str) -> str | None: - """Extract sample_id if composite matches current dataset.""" - if pd.isna(composite_id): - return None - try: - parts = composite_id.split(";") - if len(parts) != 3: - return None - # Check if this composite ID references our dataset - if 
parts[0] == repo_id and parts[1] == config_name: - return parts[2] - return None - except Exception: - return None - - comp_df["_join_sample_id"] = comp_df[via_field].apply(extract_sample_id) - - # Convert _join_sample_id to match primary_df sample_id dtype - # This handles cases where sample_id is int but composite has string - if "_join_sample_id" in comp_df.columns: - primary_dtype = primary_df["sample_id"].dtype - if pd.api.types.is_integer_dtype(primary_dtype): - # Convert to numeric, coercing errors to NaN - comp_df["_join_sample_id"] = pd.to_numeric( - comp_df["_join_sample_id"], errors="coerce" - ) - elif pd.api.types.is_string_dtype(primary_dtype): - comp_df["_join_sample_id"] = comp_df["_join_sample_id"].astype( - str - ) - - # Filter to only rows that match our dataset - comp_df = comp_df[comp_df["_join_sample_id"].notna()].copy() - - if comp_df.empty: - continue - - # Drop the via_field column (we don't need it in results) - comp_df = comp_df.drop(columns=[via_field]) - - # Merge with primary data - result_df = result_df.merge( - comp_df, left_on="sample_id", right_on="_join_sample_id", how="left" - ) - - # Drop the temporary join column - result_df = result_df.drop(columns=["_join_sample_id"]) - - except Exception: - # If enrichment fails for this comparative dataset, continue - continue - - return result_df - - @staticmethod - def _parse_composite_identifier(composite_id: str) -> tuple[str, str, str]: - """ - Parse composite sample identifier into components. - - :param composite_id: Composite ID in format "repo_id;config_name;sample_id" - :return: Tuple of (repo_id, config_name, sample_id) - - Example: - _parse_composite_identifier( - "BrentLab/harbison_2004;harbison_2004;sample_42" - ) - Returns: ("BrentLab/harbison_2004", "harbison_2004", "sample_42") - - """ - parts = composite_id.split(";") - if len(parts) != 3: - raise ValueError( - f"Invalid composite ID format: {composite_id}. 
" - "Expected 'repo_id;config_name;sample_id'" - ) - return parts[0], parts[1], parts[2] - def _build_metadata_table( self, repo_id: str, config_name: str, use_cache: bool = True ) -> pd.DataFrame: @@ -941,19 +660,23 @@ def _build_metadata_table( # Get sample-level data from HuggingFace config = card.get_config(config_name) - # Check if this is a comparative dataset - from tfbpapi.models import DatasetType - - is_comparative = ( - config - and hasattr(config, "dataset_type") - and config.dataset_type == DatasetType.COMPARATIVE - ) + # Check if sample_id exists in the data by trying a sample query + has_sample_id = False + try: + sample_df = cache_mgr.query( + f"SELECT sample_id FROM {config_name} LIMIT 1", config_name + ) + has_sample_id = "sample_id" in sample_df.columns + except Exception: + # If query fails, assume sample_id doesn't exist + has_sample_id = False if config and hasattr(config, "metadata_fields") and config.metadata_fields: # Select only metadata fields columns = ", ".join(config.metadata_fields) - if not is_comparative and "sample_id" not in config.metadata_fields: + # Only add sample_id field if it exists in the data + # and not already in metadata_fields + if has_sample_id and "sample_id" not in config.metadata_fields: columns = f"sample_id, {columns}" sql = f"SELECT DISTINCT {columns} FROM {config_name}" else: @@ -962,9 +685,12 @@ def _build_metadata_table( df = cache_mgr.query(sql, config_name) - # For non-comparative datasets: one row per sample_id - # For comparative datasets: keep all rows (each row is a relationship) - if not is_comparative and "sample_id" in df.columns: + # If sample_id doesn't exist, generate from row number + if "sample_id" not in df.columns and not df.empty: + df["sample_id"] = df.index.astype(str) + + # One row per sample_id + if "sample_id" in df.columns: df = df.groupby("sample_id").first().reset_index() # Add repo-level metadata as columns @@ -976,8 +702,8 @@ def _build_metadata_table( if field_metadata: df = self._add_field_metadata(df, field_metadata) - # Apply dtype conversions to DataFrame columns - df = self._apply_column_dtypes(df, property_mappings) + # Join comparative analyses data if configured + df = self._join_comparative_analyses(df, repo_id, config_name) # Cache result if use_cache: @@ -985,73 +711,9 @@ def _build_metadata_table( return df - except Exception as e: - # Log error for debugging with full traceback - import traceback - - print(f"Error downloading metadata for {config_name}: {e}") - traceback.print_exc() - # Return empty DataFrame on error + except Exception: return pd.DataFrame() - def _apply_column_dtypes( - self, df: pd.DataFrame, property_mappings: dict[str, PropertyMapping] - ) -> pd.DataFrame: - """ - Apply dtype conversions to DataFrame columns based on property mappings. 
- - :param df: DataFrame to apply conversions to - :param property_mappings: Property mappings with dtype specifications - :return: DataFrame with converted column dtypes - - """ - for prop_name, mapping in property_mappings.items(): - # Skip if no dtype specified or column doesn't exist - if not mapping.dtype or prop_name not in df.columns: - continue - - # Convert column dtype - try: - if mapping.dtype == "numeric": - df[prop_name] = pd.to_numeric(df[prop_name], errors="coerce") - elif mapping.dtype == "bool": - df[prop_name] = df[prop_name].astype(bool) - elif mapping.dtype == "string": - df[prop_name] = df[prop_name].astype(str) - except (ValueError, TypeError): - # Conversion failed, leave as is - pass - - return df - - def _convert_dtype(self, value: Any, dtype: str) -> Any: - """ - Convert value to specified data type. - - :param value: The value to convert to a given `dtype` - :param dtype: Target data type ("numeric", "bool", "string") - - :return: Converted value or None if conversion fails - - """ - if value is None: - return None - - try: - if dtype == "numeric": - # Try float first (handles both int and float) - return float(value) - elif dtype == "bool": - return bool(value) - elif dtype == "string": - return str(value) - else: - # Unknown dtype, pass through unchanged - return value - except (ValueError, TypeError): - # Conversion failed, return None - return None - def _extract_repo_level( self, card: DataCard, @@ -1085,12 +747,14 @@ def _extract_repo_level( continue # Build full path - # Note: `conditions` is already the experimental_conditions dict, - # so we don't add the prefix full_path = mapping.path + # Skip if path is None (shouldn't happen for repo-level, but be safe) + if full_path is None: + continue + # Get value at path - value = get_nested_value(conditions, full_path) # type: ignore + value = get_nested_value(conditions, full_path) # Handle missing values missing_label = self.config.missing_value_labels.get(prop_name) @@ -1102,12 +766,6 @@ def _extract_repo_level( # Ensure value is a list actual_values = [value] if not isinstance(value, list) else value - # Apply dtype conversion if specified - if mapping.dtype: - actual_values = [ - self._convert_dtype(v, mapping.dtype) for v in actual_values - ] - # Normalize using aliases aliases = self.config.factor_aliases.get(prop_name) normalized_values = [ @@ -1136,18 +794,17 @@ def _extract_field_level( field_metadata: dict[str, dict[str, Any]] = {} # Group property mappings by field - field_mappings: dict[str, dict[str, PropertyMapping]] = {} + field_mappings: dict[str, dict[str, str | None]] = {} for prop_name, mapping in property_mappings.items(): - # Only process if field is specified AND path exists - # (no path means it's just a column alias, not metadata extraction) - if mapping.field is not None and mapping.path is not None: + if mapping.field is not None: field_name = mapping.field if field_name not in field_mappings: field_mappings[field_name] = {} - field_mappings[field_name][prop_name] = mapping + # Store path (can be None for column aliases) + field_mappings[field_name][prop_name] = mapping.path # Process each field that has mappings - for field_name, prop_mappings_dict in field_mappings.items(): + for field_name, prop_paths in field_mappings.items(): # Get field definitions definitions = card.get_field_definitions(config_name, field_name) if not definitions: @@ -1158,9 +815,13 @@ def _extract_field_level( if field_value not in field_metadata: field_metadata[field_value] = {} - for prop_name, mapping in 
prop_mappings_dict.items(): - # Get value at path - value = get_nested_value(definition, mapping.path) # type: ignore + for prop_name, path in prop_paths.items(): + # Handle path=None case: use field_value directly + if path is None: + value = field_value + else: + # Get value at path + value = get_nested_value(definition, path) # Handle missing values missing_label = self.config.missing_value_labels.get(prop_name) @@ -1172,12 +833,6 @@ def _extract_field_level( # Ensure value is a list actual_values = [value] if not isinstance(value, list) else value - # Apply dtype conversion if specified - if mapping.dtype: - actual_values = [ - self._convert_dtype(v, mapping.dtype) for v in actual_values - ] - # Normalize using aliases aliases = self.config.factor_aliases.get(prop_name) normalized_values = [ @@ -1243,23 +898,31 @@ def _apply_filters( # Handle numeric range filters if isinstance(filter_value, tuple): operator = filter_value[0] + # For numeric comparisons, try to convert column to numeric + # (normalize_value returns strings, + # but we need numeric for range queries) + try: + df_field = pd.to_numeric(df[field], errors="coerce") + except (ValueError, TypeError): + df_field = df[field] + if operator == "between" and len(filter_value) == 3: df = df[ - (df[field] >= filter_value[1]) & (df[field] <= filter_value[2]) + (df_field >= filter_value[1]) & (df_field <= filter_value[2]) ] elif operator in (">=", ">", "<=", "<", "==", "!="): if operator == ">=": - df = df[df[field] >= filter_value[1]] + df = df[df_field >= filter_value[1]] elif operator == ">": - df = df[df[field] > filter_value[1]] + df = df[df_field > filter_value[1]] elif operator == "<=": - df = df[df[field] <= filter_value[1]] + df = df[df_field <= filter_value[1]] elif operator == "<": - df = df[df[field] < filter_value[1]] + df = df[df_field < filter_value[1]] elif operator == "==": - df = df[df[field] == filter_value[1]] + df = df[df_field == filter_value[1]] elif operator == "!=": - df = df[df[field] != filter_value[1]] + df = df[df_field != filter_value[1]] else: # Exact match with alias expansion aliases = self.config.factor_aliases.get(field) @@ -1273,9 +936,11 @@ def _apply_filters( df = df[df[field].isin(expanded_values)] else: # No aliases, exact match - df = df[df[field] == filter_value] + # Handle type conversion: normalize_value returns strings, + # so convert filter_value to string for comparison + df = df[df[field] == str(filter_value)] - return df + return df.copy() def _get_complete_data( self, @@ -1331,6 +996,264 @@ def _get_complete_data( except Exception: return pd.DataFrame() + @staticmethod + def _parse_composite_identifier(composite_id: str) -> tuple[str, str, str]: + """ + Parse composite identifier into repo_id, config_name, and sample_id. + + Format: "repo_id;config_name;sample_id" + + :param composite_id: Composite identifier string + :return: Tuple of (repo_id, config_name, sample_id) + :raises ValueError: If format is invalid + + Example: + >>> VirtualDB._parse_composite_identifier( + ... "BrentLab/harbison_2004;harbison_2004;42" + ... ) + ("BrentLab/harbison_2004", "harbison_2004", "42") + + """ + parts = composite_id.split(";") + if len(parts) != 3: + raise ValueError( + f"Invalid composite ID format: {composite_id}. " + "Expected format: 'repo_id;config_name;sample_id'" + ) + return tuple(parts) # type: ignore + + def _join_comparative_analyses( + self, df: pd.DataFrame, repo_id: str, config_name: str + ) -> pd.DataFrame: + """ + Join comparative analyses data to the primary dataset DataFrame. 
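+
+        Joined columns are renamed to their configured property names; on a
+        name collision the comparative dataset name is appended as a suffix.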
+ + For each comparative_analysis configured for this dataset, loads the comparative + dataset directly via SQL and joins fields via composite identifiers. + + :param df: Primary dataset DataFrame with sample_id column + :param repo_id: Repository ID of the primary dataset + :param config_name: Config name of the primary dataset + :return: DataFrame with joined comparative analysis fields + + """ + if df.empty or "sample_id" not in df.columns: + return df + + # Get dataset configuration + repo_config = self.config.get_repository_config(repo_id) + if not repo_config or not repo_config.dataset: + return df + + dataset_config = repo_config.dataset.get(config_name) + if not dataset_config or not dataset_config.comparative_analyses: + return df + + result_df = df.copy() + + # Process each comparative analysis + for comp_analysis in dataset_config.comparative_analyses: + try: + # Build composite identifier column for join + # Format: "repo_id;config_name;sample_id" + temp_composite_col = "_temp_composite_id" + result_df[temp_composite_col] = f"{repo_id};{config_name};" + result_df[ + "sample_id" + ].astype(str) + + # Get property mappings for comparative dataset + comp_mappings = self.config.get_property_mappings( + comp_analysis.repo, comp_analysis.dataset + ) + + # Build mapping from property names to actual column names + # PropertyMapping.field points to the actual column name in the dataset + prop_to_col: dict[str, str] = {} + for prop_name, mapping in comp_mappings.items(): + if mapping.field: + prop_to_col[prop_name] = mapping.field + else: + prop_to_col[prop_name] = prop_name + + # Get via_field actual column name + via_field = comp_analysis.via_field + via_field_col = prop_to_col.get(via_field, via_field) + + # Determine which fields to select from comparative dataset + # Exclude via_field and sample_id from the join fields + fields_to_join_props = [ + prop_name + for prop_name in prop_to_col.keys() + if prop_name not in ["sample_id", via_field, "dataset_id"] + ] + + if not fields_to_join_props: + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + # Build SQL columns: actual column names + sql_columns = [via_field_col] + sql_columns.extend([prop_to_col[prop] for prop in fields_to_join_props]) + sql_columns_str = ", ".join(sql_columns) + + # Load comparative dataset directly via SQL + # This bypasses _build_metadata_table which may not work + # for comparative datasets + comp_cache_mgr = HfCacheManager( + comp_analysis.repo, + duckdb_conn=duckdb.connect(":memory:"), + token=self.token, + ) + + # Get the actual table name (metadata_{config_name}) + # to avoid string replacement issues + comp_config = comp_cache_mgr.get_config(comp_analysis.dataset) + if not comp_config: + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + # Load the config to get the actual table name + config_result = comp_cache_mgr._get_metadata_for_config( + comp_config, force_refresh=False + ) + if not config_result.get("success", False): + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + actual_table_name = config_result.get("table_name") + if not actual_table_name: + actual_table_name = f"metadata_{comp_analysis.dataset}" + + # Build WHERE clause to filter only matching records + # Try both original repo_id and capitalized version + # (e.g., hackett_2020 -> Hackett_2020) + composite_ids = result_df[temp_composite_col].unique().tolist() + + # Generate alternative repo_id format with capitalized + # first letter after slash + # e.g., 
"BrentLab/hackett_2020" -> "BrentLab/Hackett_2020" + alternative_repo_id = repo_id + if "/" in repo_id: + parts = repo_id.split("/", 1) + if len(parts) == 2 and parts[1]: + # Capitalize first letter of dataset name + alternative_repo_id = ( + f"{parts[0]}/{parts[1][0].upper()}{parts[1][1:]}" + ) + + # Build composite IDs with both formats + all_composite_ids = set(composite_ids) # Original format + if alternative_repo_id != repo_id: + # Add alternative format for each sample_id + for sample_id in result_df["sample_id"].astype(str).unique(): + alt_id = f"{alternative_repo_id};{config_name};{sample_id}" + all_composite_ids.add(alt_id) + + # Add forward-slash format variants + # (e.g., "BrentLab/rossi_2021/rossi_2021_af_combined;{sample_id}") + # This handles cases where DTO data uses "/" instead of ";" + # between repo_id and config_name + for sample_id in result_df["sample_id"].astype(str).unique(): + # Format: "repo_id/config_name;sample_id" + slash_format_id = f"{repo_id}/{config_name};{sample_id}" + all_composite_ids.add(slash_format_id) + # Also add capitalized variant if applicable + if alternative_repo_id != repo_id: + alt_slash_format_id = ( + f"{alternative_repo_id}/{config_name};{sample_id}" + ) + all_composite_ids.add(alt_slash_format_id) + + # Escape single quotes in composite IDs + escaped_ids = [cid.replace("'", "''") for cid in all_composite_ids] + id_list = ", ".join([f"'{cid}'" for cid in escaped_ids]) + + # Use actual table name directly to avoid column name replacement issues + sql = f""" + SELECT {sql_columns_str} + FROM {actual_table_name} + WHERE {via_field_col} IN ({id_list}) + """ + + # Execute query directly instead of using query() method + # to avoid string replacement + try: + comp_df = comp_cache_mgr.duckdb_conn.execute(sql).fetchdf() + + except Exception: + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + if comp_df.empty: + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + # Rename columns to use property names (config names) + # instead of raw column names + rename_dict = {via_field_col: via_field} + for prop_name in fields_to_join_props: + actual_col = prop_to_col[prop_name] + if actual_col != prop_name: + rename_dict[actual_col] = prop_name + + comp_df = comp_df.rename(columns=rename_dict) + + # Map DTO composite IDs back to original format for join + # This handles cases where DTO uses: + # 1. Capitalized repo_id (e.g., Hackett_2020) + # 2. 
Forward-slash format
+                #    (e.g., "BrentLab/rossi_2021/rossi_2021_af_combined;{sample_id}")
+                if via_field in comp_df.columns:
+                    # Create mapping from all alternative formats to original format
+                    id_mapping = {}
+                    for sample_id in result_df["sample_id"].astype(str).unique():
+                        original_id = f"{repo_id};{config_name};{sample_id}"
+
+                        # Add capitalized variant mapping
+                        if alternative_repo_id != repo_id:
+                            alt_id = f"{alternative_repo_id};{config_name};{sample_id}"
+                            id_mapping[alt_id] = original_id
+
+                        # Add forward-slash format mappings
+                        slash_format_id = f"{repo_id}/{config_name};{sample_id}"
+                        id_mapping[slash_format_id] = original_id
+                        if alternative_repo_id != repo_id:
+                            alt_slash_format_id = (
+                                f"{alternative_repo_id}/{config_name};{sample_id}"
+                            )
+                            id_mapping[alt_slash_format_id] = original_id
+
+                    # Map alternative format IDs back to the original format
+                    # (dict.get falls back to the unmapped value)
+                    comp_df[temp_composite_col] = comp_df[via_field].map(
+                        lambda x: id_mapping.get(x, x)
+                    )
+                    # Use mapped column for join
+                    join_right_on = temp_composite_col
+                else:
+                    # Defensive fallback; via_field is always selected above,
+                    # so this branch should not normally be reached
+                    join_right_on = via_field
+
+                # Perform left join on composite identifier
+                result_df = result_df.merge(
+                    comp_df,
+                    left_on=temp_composite_col,
+                    right_on=join_right_on,
+                    how="left",
+                    suffixes=("", f"_{comp_analysis.dataset}"),
+                )
+
+                # Drop the temporary composite_id column
+                result_df = result_df.drop(columns=[temp_composite_col])
+
+            except Exception:
+                # Clean up temp column if it exists
+                if temp_composite_col in result_df.columns:
+                    result_df = result_df.drop(columns=[temp_composite_col])
+                continue
+
+        return result_df
+
     def __repr__(self) -> str:
         """String representation."""
         n_repos = len(self.config.repositories)
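To make the join semantics above concrete, here is a small standalone sketch of the identifier-variant generation that `_join_comparative_analyses` performs; `composite_id_variants` is a hypothetical helper for illustration, not part of the API.

```python
# Sketch of the composite-ID spellings tried when matching DTO rows to a
# primary sample: canonical "repo;config;sample_id", a capitalized repo
# variant, and slash-separated forms of both.
def composite_id_variants(
    repo_id: str, config_name: str, sample_id: str
) -> set[str]:
    """Return the canonical composite ID plus its known variants."""
    variants = {f"{repo_id};{config_name};{sample_id}"}
    # Slash-separated form is always tried
    variants.add(f"{repo_id}/{config_name};{sample_id}")
    org, _, name = repo_id.partition("/")
    if name:
        # Capitalize the first letter of the dataset name after the slash
        alt_repo = f"{org}/{name[0].upper()}{name[1:]}"
        variants.add(f"{alt_repo};{config_name};{sample_id}")
        variants.add(f"{alt_repo}/{config_name};{sample_id}")
    return variants

# composite_id_variants("BrentLab/hackett_2020", "hackett_2020", "42") ->
# {"BrentLab/hackett_2020;hackett_2020;42",
#  "BrentLab/Hackett_2020;hackett_2020;42",
#  "BrentLab/hackett_2020/hackett_2020;42",
#  "BrentLab/Hackett_2020/hackett_2020;42"}
```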