diff --git a/docs/tutorials/DTO_analysis.ipynb b/docs/tutorials/DTO_analysis.ipynb
new file mode 100644
index 0000000..4c11286
--- /dev/null
+++ b/docs/tutorials/DTO_analysis.ipynb
@@ -0,0 +1,968 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# DTO Analysis: Significance Filtering of Cross-Dataset Binding Samples\n",
+ "\n",
+ "This notebook is used to analyze the correlation between transcription factor (TF) binding data and perturbation data.\n",
+ "\n",
+ "## Analysis Objectives\n",
+ "\n",
+ "1. Select all binding samples with DTO P<=0.01 compared to **Hackett-2020-ZEV**.\n",
+ "2. Select all binding samples with DTO P<=0.01 compared to **Kemmeren-2014-TFKO**.\n",
+ "3. Find the intersection of the two sets above.\n",
+ "4. For each regulator in the active set, count the number of active samples.\n",
+ "\n",
+ "## Challenges and Additional Analysis\n",
+ "\n",
+ "- Explore the time point effects in the Hackett data.\n",
+ "- Analyze the impact of different time points on the DTO distribution.\n",
+ "- Select the optimal conditions (e.g., ZEV vs GEV) for each regulator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import necessary libraries\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from collections import Counter\n",
+ "\n",
+ "# Set display options\n",
+ "pd.set_option('display.max_columns', None)\n",
+ "pd.set_option('display.max_rows', 100)\n",
+ "pd.set_option('display.width', None)\n",
+ "\n",
+ "# Set plot style\n",
+ "plt.style.use('seaborn-v0_8-whitegrid')\n",
+ "sns.set_palette('husl')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u2705 Configuration file saved at: /tmp/tmpv37ibelm/vdb_config.yaml\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create VirtualDB configuration\n",
+ "# This configuration defines how to map the fields of different datasets and how to associate DTO comparative analysis data\n",
+ "\n",
+ "import tempfile\n",
+ "from pathlib import Path\n",
+ "\n",
+ "config_yaml = \"\"\"\n",
+ "repositories:\n",
+ " BrentLab/harbison_2004:\n",
+ " dataset:\n",
+ " harbison_2004:\n",
+ " sample_id:\n",
+ " field: sample_id\n",
+ " carbon_source:\n",
+ " field: condition\n",
+ " path: media.carbon_source.compound\n",
+ " temperature_celsius:\n",
+ " field: condition\n",
+ " path: temperature_celsius\n",
+ " dtype: numeric\n",
+ " environmental_condition:\n",
+ " field: condition\n",
+ " regualtor_locus_tag:\n",
+ " field: regulator_locus_tag\n",
+ " regulator_symbol:\n",
+ " field: regulator_symbol\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: binding_id\n",
+ "\n",
+ " BrentLab/rossi_2021:\n",
+ " carbon_source: \n",
+ " path: media.carbon_source.compound\n",
+ " temperature_celsius: \n",
+ " path: temperature_celsius\n",
+ " dataset:\n",
+ " rossi_2021_af_combined:\n",
+ " sample_id: \n",
+ " field: sample_id\n",
+ " regulator_locus_tag:\n",
+ " field: regulator_locus_tag\n",
+ " target_locus_tag:\n",
+ " field: target_locus_tag\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: binding_id\n",
+ "\n",
+ " BrentLab/mahendrawada_2025:\n",
+ " dataset:\n",
+ " reprocessed_diffcontrol_5prime:\n",
+ " sample_id:\n",
+ " field: sample_id\n",
+ " control_source:\n",
+ " field: control_source\n",
+ " regulator_locus_tag:\n",
+ " field: regulator_locus_tag\n",
+ " regulator_symbol:\n",
+ " field: regulator_symbol\n",
+ " environmental_condition:\n",
+ " field: condition\n",
+ " temperature_celsius:\n",
+ " field: condition\n",
+ " path: temperature_celsius\n",
+ " dtype: numeric\n",
+ " media_name:\n",
+ " field: condition\n",
+ " path: media.name\n",
+ " carbon_source:\n",
+ " field: condition\n",
+ " path: media.carbon_source\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: binding_id\n",
+ "\n",
+ "\n",
+ " BrentLab/callingcards:\n",
+ " carbon_source: \n",
+ " path: media.carbon_source.compound\n",
+ " temperature_celsius: \n",
+ " path: temperature_celsius\n",
+ " dataset:\n",
+ " annotated_features:\n",
+ " id:\n",
+ " field: id\n",
+ " regulator_locus_tag:\n",
+ " field: target_locus_tag\n",
+ " regulator_symbol:\n",
+ " field: target_symbol\n",
+ " \n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: binding_id\n",
+ " \n",
+ " BrentLab/hackett_2020:\n",
+ " dataset:\n",
+ " hackett_2020:\n",
+ " sample_id:\n",
+ " field: sample_id\n",
+ " dtype: numeric\n",
+ " regulator_locus_tag:\n",
+ " field: regulator_locus_tag\n",
+ " temperature_celsius:\n",
+ " path: temperature_celsius\n",
+ " dtype: numeric\n",
+ " cultivation_method:\n",
+ " path: cultivation_method\n",
+ " media_name:\n",
+ " path: media.name\n",
+ " induction_system:\n",
+ " field: mechanism\n",
+ " inducer_compound:\n",
+ " field: mechanism\n",
+ " path: definitions.inducer\n",
+ " nutrient_restriction:\n",
+ " field: restriction\n",
+ " log2fc:\n",
+ " field: log2_shrunken_timecourses\n",
+ " dtype: numeric\n",
+ " log2_raw_ratio:\n",
+ " field: log2_ratio\n",
+ " dtype: numeric\n",
+ " time_point:\n",
+ " field: time\n",
+ " dtype: numeric\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: perturbation_id\n",
+ "\n",
+ " BrentLab/kemmeren_2014:\n",
+ " dataset:\n",
+ " kemmeren_2014:\n",
+ " sample_id:\n",
+ " field: sample_id\n",
+ " carbon_source:\n",
+ " path: media.carbon_source.compound\n",
+ " temperature_celsius:\n",
+ " path: temperature_celsius\n",
+ " dtype: numeric\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: perturbation_id\n",
+ "\n",
+ " BrentLab/yeast_comparative_analysis:\n",
+ " dataset:\n",
+ " dto:\n",
+ " binding_id:\n",
+ " field: binding_id\n",
+ " perturbation_id:\n",
+ " field: perturbation_id\n",
+ " fdr:\n",
+ " field: dto_fdr\n",
+ " dtype: numeric\n",
+ " pvalue:\n",
+ " field: dto_empirical_pvalue\n",
+ " dtype: numeric\n",
+ " binding_threshold:\n",
+ " field: binding_rank_threshold\n",
+ " dtype: numeric\n",
+ " perturbation_threshold:\n",
+ " field: perturbation_rank_threshold\n",
+ " dtype: numeric\n",
+ " binding_set_size:\n",
+ " field: binding_set_size\n",
+ " dtype: numeric\n",
+ " perturbation_set_size:\n",
+ " field: perturbation_set_size\n",
+ " dtype: numeric\n",
+ "\n",
+ "factor_aliases:\n",
+ " carbon_source:\n",
+ " glucose: [D-glucose, dextrose, glu]\n",
+ " galactose: [D-galactose, gal]\n",
+ " raffinose: [D-raffinose]\n",
+ "\n",
+ "missing_value_labels:\n",
+ " carbon_source: \"unspecified\"\n",
+ "\n",
+ "description:\n",
+ " carbon_source: The carbon source provided during growth\n",
+ " temperature_celsius: Growth temperature in degrees Celsius\n",
+ " environmental_condition: Named environmental condition\n",
+ "\"\"\"\n",
+ "\n",
+ "# Save the configuration to a temporary file\n",
+ "temp_config = Path(tempfile.mkdtemp()) / \"vdb_config.yaml\"\n",
+ "temp_config.write_text(config_yaml)\n",
+ "\n",
+ "print(f\"\u2705 Configuration file saved at: {temp_config}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u2705 VirtualDB initialized successfully!\n",
+ "Number of configured repositories: 7\n",
+ "\n",
+ "Configured datasets:\n",
+ " - BrentLab/harbison_2004/harbison_2004\n",
+ " - BrentLab/rossi_2021/rossi_2021_af_combined\n",
+ " - BrentLab/mahendrawada_2025/reprocessed_diffcontrol_5prime\n",
+ " - BrentLab/callingcards/annotated_features\n",
+ " - BrentLab/hackett_2020/hackett_2020\n",
+ " - BrentLab/kemmeren_2014/kemmeren_2014\n",
+ " - BrentLab/yeast_comparative_analysis/dto\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Initialize VirtualDB\n",
+ "from tfbpapi.virtual_db import VirtualDB\n",
+ "\n",
+ "# Token authentication required\n",
+ "hf_token = \"\"\n",
+ "\n",
+ "vdb = VirtualDB(str(temp_config), token=hf_token)\n",
+ "\n",
+ "print(\"\u2705 VirtualDB initialized successfully!\")\n",
+ "print(f\"Number of configured repositories: {len(vdb.config.repositories)}\")\n",
+ "\n",
+ "# List all configured datasets\n",
+ "print(\"\\nConfigured datasets:\")\n",
+ "for repo_id, repo_config in vdb.config.repositories.items():\n",
+ " if repo_config.dataset:\n",
+ " for config_name in repo_config.dataset.keys():\n",
+ " print(f\" - {repo_id}/{config_name}\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Added the ability to perform comparative analysis to the query."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Fetching 6 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 51569.31it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sample_id | \n",
+ " carbon_source | \n",
+ " temperature_celsius | \n",
+ " pvalue | \n",
+ " perturbation_id | \n",
+ " dataset_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.301 | \n",
+ " BrentLab/kemmeren_2014;kemmeren_2014;18 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " NaN | \n",
+ " BrentLab/Hackett_2020;hackett_2020;33 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.512 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;34 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.306 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;40 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " NaN | \n",
+ " BrentLab/Hackett_2020;hackett_2020;37 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.309 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;38 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.644 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;36 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.411 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;35 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.536 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;39 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sample_id carbon_source temperature_celsius pvalue \\\n",
+ "0 1 glucose 30 NaN \n",
+ "1 2 glucose 30 0.301 \n",
+ "2 2 glucose 30 NaN \n",
+ "3 2 glucose 30 0.512 \n",
+ "4 2 glucose 30 0.306 \n",
+ "5 2 glucose 30 NaN \n",
+ "6 2 glucose 30 0.309 \n",
+ "7 2 glucose 30 0.644 \n",
+ "8 2 glucose 30 0.411 \n",
+ "9 2 glucose 30 0.536 \n",
+ "\n",
+ " perturbation_id \\\n",
+ "0 NaN \n",
+ "1 BrentLab/kemmeren_2014;kemmeren_2014;18 \n",
+ "2 BrentLab/Hackett_2020;hackett_2020;33 \n",
+ "3 BrentLab/Hackett_2020;hackett_2020;34 \n",
+ "4 BrentLab/Hackett_2020;hackett_2020;40 \n",
+ "5 BrentLab/Hackett_2020;hackett_2020;37 \n",
+ "6 BrentLab/Hackett_2020;hackett_2020;38 \n",
+ "7 BrentLab/Hackett_2020;hackett_2020;36 \n",
+ "8 BrentLab/Hackett_2020;hackett_2020;35 \n",
+ "9 BrentLab/Hackett_2020;hackett_2020;39 \n",
+ "\n",
+ " dataset_id \n",
+ "0 BrentLab/harbison_2004/harbison_2004 \n",
+ "1 BrentLab/harbison_2004/harbison_2004 \n",
+ "2 BrentLab/harbison_2004/harbison_2004 \n",
+ "3 BrentLab/harbison_2004/harbison_2004 \n",
+ "4 BrentLab/harbison_2004/harbison_2004 \n",
+ "5 BrentLab/harbison_2004/harbison_2004 \n",
+ "6 BrentLab/harbison_2004/harbison_2004 \n",
+ "7 BrentLab/harbison_2004/harbison_2004 \n",
+ "8 BrentLab/harbison_2004/harbison_2004 \n",
+ "9 BrentLab/harbison_2004/harbison_2004 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "all_p001 = vdb.query(\n",
+ " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n",
+ " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"pvalue\", \"perturbation_id\"],\n",
+ ")\n",
+ "\n",
+ "all_p001.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Fetching 6 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 49152.00it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sample_id | \n",
+ " temperature_celsius | \n",
+ " perturbation_id | \n",
+ " binding_id | \n",
+ " pvalue | \n",
+ " dataset_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 34 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;34 | \n",
+ " BrentLab/callingcards;annotated_features;394 | \n",
+ " 0.010 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 38 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;38 | \n",
+ " BrentLab/callingcards;annotated_features;380 | \n",
+ " 0.003 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 39 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;39 | \n",
+ " BrentLab/callingcards;annotated_features;380 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 39 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;39 | \n",
+ " BrentLab/callingcards;annotated_features;748 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 40 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;40 | \n",
+ " BrentLab/callingcards;annotated_features;380 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 40 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;40 | \n",
+ " BrentLab/callingcards;annotated_features;394 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 40 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;40 | \n",
+ " BrentLab/callingcards;annotated_features;748 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 44 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " BrentLab/callingcards;annotated_features;34 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 44 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " BrentLab/harbison_2004;harbison_2004;7 | \n",
+ " 0.009 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 44 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " BrentLab/harbison_2004;harbison_2004;8 | \n",
+ " 0.007 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sample_id temperature_celsius perturbation_id \\\n",
+ "0 34 30 BrentLab/Hackett_2020;hackett_2020;34 \n",
+ "1 38 30 BrentLab/Hackett_2020;hackett_2020;38 \n",
+ "2 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n",
+ "3 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n",
+ "4 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n",
+ "5 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n",
+ "6 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n",
+ "7 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n",
+ "8 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n",
+ "9 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n",
+ "\n",
+ " binding_id pvalue \\\n",
+ "0 BrentLab/callingcards;annotated_features;394 0.010 \n",
+ "1 BrentLab/callingcards;annotated_features;380 0.003 \n",
+ "2 BrentLab/callingcards;annotated_features;380 0.000 \n",
+ "3 BrentLab/callingcards;annotated_features;748 0.000 \n",
+ "4 BrentLab/callingcards;annotated_features;380 0.000 \n",
+ "5 BrentLab/callingcards;annotated_features;394 0.000 \n",
+ "6 BrentLab/callingcards;annotated_features;748 0.000 \n",
+ "7 BrentLab/callingcards;annotated_features;34 0.000 \n",
+ "8 BrentLab/harbison_2004;harbison_2004;7 0.009 \n",
+ "9 BrentLab/harbison_2004;harbison_2004;8 0.007 \n",
+ "\n",
+ " dataset_id \n",
+ "0 BrentLab/hackett_2020/hackett_2020 \n",
+ "1 BrentLab/hackett_2020/hackett_2020 \n",
+ "2 BrentLab/hackett_2020/hackett_2020 \n",
+ "3 BrentLab/hackett_2020/hackett_2020 \n",
+ "4 BrentLab/hackett_2020/hackett_2020 \n",
+ "5 BrentLab/hackett_2020/hackett_2020 \n",
+ "6 BrentLab/hackett_2020/hackett_2020 \n",
+ "7 BrentLab/hackett_2020/hackett_2020 \n",
+ "8 BrentLab/hackett_2020/hackett_2020 \n",
+ "9 BrentLab/hackett_2020/hackett_2020 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Even if not specified in the `fields` parameter, the filter options will still be retained within the `fields` parameter.\n",
+ "all_p001 = vdb.query(\n",
+ " datasets=[(\"BrentLab/hackett_2020\", \"hackett_2020\")],\n",
+ " filters={\n",
+ " \"pvalue\": (\"<=\", 0.01) \n",
+ " },\n",
+ " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"perturbation_id\",\"binding_id\"],\n",
+ ")\n",
+ "\n",
+ "all_p001.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create a function called query_dto that is specifically responsible for retrieving the DTO data for the specified binding and perturbation datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found 126 DTO records\n",
+ "Column names: ['binding_id', 'perturbation_id', 'pvalue', 'fdr', 'sample_id', 'carbon_source', 'temperature_celsius']\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " binding_id | \n",
+ " perturbation_id | \n",
+ " pvalue | \n",
+ " fdr | \n",
+ " sample_id | \n",
+ " carbon_source | \n",
+ " temperature_celsius | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 12 | \n",
+ " BrentLab/harbison_2004;harbison_2004;3 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;85 | \n",
+ " 0.004 | \n",
+ " 0.000225 | \n",
+ " 3 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " BrentLab/harbison_2004;harbison_2004;3 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;87 | \n",
+ " 0.010 | \n",
+ " 0.000225 | \n",
+ " 3 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " BrentLab/harbison_2004;harbison_2004;3 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;82 | \n",
+ " 0.005 | \n",
+ " 0.000225 | \n",
+ " 3 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 50 | \n",
+ " BrentLab/harbison_2004;harbison_2004;7 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " 0.009 | \n",
+ " 0.022495 | \n",
+ " 7 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 59 | \n",
+ " BrentLab/harbison_2004;harbison_2004;8 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " 0.007 | \n",
+ " 0.080057 | \n",
+ " 8 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 61 | \n",
+ " BrentLab/harbison_2004;harbison_2004;8 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;46 | \n",
+ " 0.000 | \n",
+ " 0.062511 | \n",
+ " 8 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 66 | \n",
+ " BrentLab/harbison_2004;harbison_2004;9 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " 0.006 | \n",
+ " 0.110684 | \n",
+ " 9 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 68 | \n",
+ " BrentLab/harbison_2004;harbison_2004;9 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;48 | \n",
+ " 0.004 | \n",
+ " 0.317610 | \n",
+ " 9 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 71 | \n",
+ " BrentLab/harbison_2004;harbison_2004;9 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;46 | \n",
+ " 0.001 | \n",
+ " 0.117687 | \n",
+ " 9 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 76 | \n",
+ " BrentLab/harbison_2004;harbison_2004;10 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;46 | \n",
+ " 0.003 | \n",
+ " 0.099272 | \n",
+ " 10 | \n",
+ " unspecified | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " binding_id \\\n",
+ "12 BrentLab/harbison_2004;harbison_2004;3 \n",
+ "17 BrentLab/harbison_2004;harbison_2004;3 \n",
+ "18 BrentLab/harbison_2004;harbison_2004;3 \n",
+ "50 BrentLab/harbison_2004;harbison_2004;7 \n",
+ "59 BrentLab/harbison_2004;harbison_2004;8 \n",
+ "61 BrentLab/harbison_2004;harbison_2004;8 \n",
+ "66 BrentLab/harbison_2004;harbison_2004;9 \n",
+ "68 BrentLab/harbison_2004;harbison_2004;9 \n",
+ "71 BrentLab/harbison_2004;harbison_2004;9 \n",
+ "76 BrentLab/harbison_2004;harbison_2004;10 \n",
+ "\n",
+ " perturbation_id pvalue fdr sample_id \\\n",
+ "12 BrentLab/Hackett_2020;hackett_2020;85 0.004 0.000225 3 \n",
+ "17 BrentLab/Hackett_2020;hackett_2020;87 0.010 0.000225 3 \n",
+ "18 BrentLab/Hackett_2020;hackett_2020;82 0.005 0.000225 3 \n",
+ "50 BrentLab/Hackett_2020;hackett_2020;44 0.009 0.022495 7 \n",
+ "59 BrentLab/Hackett_2020;hackett_2020;44 0.007 0.080057 8 \n",
+ "61 BrentLab/Hackett_2020;hackett_2020;46 0.000 0.062511 8 \n",
+ "66 BrentLab/Hackett_2020;hackett_2020;44 0.006 0.110684 9 \n",
+ "68 BrentLab/Hackett_2020;hackett_2020;48 0.004 0.317610 9 \n",
+ "71 BrentLab/Hackett_2020;hackett_2020;46 0.001 0.117687 9 \n",
+ "76 BrentLab/Hackett_2020;hackett_2020;46 0.003 0.099272 10 \n",
+ "\n",
+ " carbon_source temperature_celsius \n",
+ "12 glucose 30 \n",
+ "17 glucose 30 \n",
+ "18 glucose 30 \n",
+ "50 glucose 30 \n",
+ "59 glucose 30 \n",
+ "61 glucose 30 \n",
+ "66 glucose 30 \n",
+ "68 glucose 30 \n",
+ "71 glucose 30 \n",
+ "76 unspecified 30 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Example: Query the intersection of harbison and hackett, filter for pvalue <= 0.01\n",
+ "dto_result = vdb.query_dto(\n",
+ " binding_dataset=(\"BrentLab/harbison_2004\", \"harbison_2004\"),\n",
+ " perturbation_dataset=(\"BrentLab/hackett_2020\", \"hackett_2020\"),\n",
+ " dto_filters={\"pvalue\": (\"<=\", 0.01)},\n",
+ " fields=[\"binding_id\", \"perturbation_id\", \"pvalue\", \"fdr\",\"sample_id\", \"carbon_source\", \"temperature_celsius\"]\n",
+ ")\n",
+ "\n",
+ "print(f\"Found {len(dto_result)} DTO records\")\n",
+ "print(f\"Column names: {list(dto_result.columns)}\")\n",
+ "dto_result.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fetching 135 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 135/135 [00:00<00:00, 18401.45it/s]\n",
+ "Query execution failed: Binder Error: Referenced column \"sample_id\" not found in FROM clause!\n",
+ "Candidate bindings: \"callingcards_enrichment\", \"target_symbol\"\n",
+ "\n",
+ "LINE 1: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n",
+ " ^\n",
+ "SQL: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n"
+ ]
+ },
+ {
+ "ename": "",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": []
+ }
+ ],
+ "source": [
+ "all_p001 = vdb.query( \n",
+ " datasets=[(\"BrentLab/callingcards\", \"annotated_features\")],\n",
+ " complete=False\n",
+ ")\n",
+ "all_p001.head()\n",
+ "print(f\"\u603b\u5171\u6709 {len(all_p001)} \u884c\u6570\u636e\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "query in binding or pert,not compara, use vdb function"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/docs/tutorials/show_new_functions.ipynb b/docs/tutorials/show_new_functions.ipynb
new file mode 100644
index 0000000..efd8da3
--- /dev/null
+++ b/docs/tutorials/show_new_functions.ipynb
@@ -0,0 +1,960 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b5f1facc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import necessary libraries\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from collections import Counter\n",
+ "\n",
+ "# Set display options\n",
+ "pd.set_option('display.max_columns', None)\n",
+ "pd.set_option('display.max_rows', 100)\n",
+ "pd.set_option('display.width', None)\n",
+ "\n",
+ "# Set plot style\n",
+ "plt.style.use('seaborn-v0_8-whitegrid')\n",
+ "sns.set_palette('husl')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "5452d8e5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✅ Configuration file saved at: /tmp/tmp9lavjul7/vdb_config.yaml\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create VirtualDB configuration\n",
+ "# This configuration defines how to map the fields of different datasets and how to associate DTO comparative analysis data\n",
+ "\n",
+ "import tempfile\n",
+ "from pathlib import Path\n",
+ "\n",
+ "config_yaml = \"\"\"\n",
+ "repositories:\n",
+ " BrentLab/harbison_2004:\n",
+ " dataset:\n",
+ " harbison_2004:\n",
+ " sample_id:\n",
+ " field: sample_id\n",
+ " carbon_source:\n",
+ " field: condition\n",
+ " path: media.carbon_source.compound\n",
+ " temperature_celsius:\n",
+ " field: condition\n",
+ " path: temperature_celsius\n",
+ " dtype: numeric\n",
+ " environmental_condition:\n",
+ " field: condition\n",
+ " regualtor_locus_tag:\n",
+ " field: regulator_locus_tag\n",
+ " regulator_symbol:\n",
+ " field: regulator_symbol\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: binding_id\n",
+ "\n",
+ " BrentLab/rossi_2021:\n",
+ " carbon_source: \n",
+ " path: media.carbon_source.compound\n",
+ " temperature_celsius: \n",
+ " path: temperature_celsius\n",
+ " dataset:\n",
+ " rossi_2021_af_combined:\n",
+ " sample_id: \n",
+ " field: sample_id\n",
+ " regulator_locus_tag:\n",
+ " field: regulator_locus_tag\n",
+ " target_locus_tag:\n",
+ " field: target_locus_tag\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: binding_id\n",
+ "\n",
+ " BrentLab/mahendrawada_2025:\n",
+ " dataset:\n",
+ " reprocessed_diffcontrol_5prime:\n",
+ " sample_id:\n",
+ " field: sample_id\n",
+ " control_source:\n",
+ " field: control_source\n",
+ " regulator_locus_tag:\n",
+ " field: regulator_locus_tag\n",
+ " regulator_symbol:\n",
+ " field: regulator_symbol\n",
+ " environmental_condition:\n",
+ " field: condition\n",
+ " temperature_celsius:\n",
+ " field: condition\n",
+ " path: temperature_celsius\n",
+ " dtype: numeric\n",
+ " media_name:\n",
+ " field: condition\n",
+ " path: media.name\n",
+ " carbon_source:\n",
+ " field: condition\n",
+ " path: media.carbon_source\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: binding_id\n",
+ "\n",
+ "\n",
+ " BrentLab/callingcards:\n",
+ " carbon_source: \n",
+ " path: media.carbon_source.compound\n",
+ " temperature_celsius: \n",
+ " path: temperature_celsius\n",
+ " dataset:\n",
+ " annotated_features:\n",
+ " id:\n",
+ " field: id\n",
+ " regulator_locus_tag:\n",
+ " field: target_locus_tag\n",
+ " regulator_symbol:\n",
+ " field: target_symbol\n",
+ " \n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: binding_id\n",
+ " \n",
+ " BrentLab/hackett_2020:\n",
+ " dataset:\n",
+ " hackett_2020:\n",
+ " sample_id:\n",
+ " field: sample_id\n",
+ " dtype: numeric\n",
+ " regulator_locus_tag:\n",
+ " field: regulator_locus_tag\n",
+ " temperature_celsius:\n",
+ " path: temperature_celsius\n",
+ " dtype: numeric\n",
+ " cultivation_method:\n",
+ " path: cultivation_method\n",
+ " media_name:\n",
+ " path: media.name\n",
+ " induction_system:\n",
+ " field: mechanism\n",
+ " inducer_compound:\n",
+ " field: mechanism\n",
+ " path: definitions.inducer\n",
+ " nutrient_restriction:\n",
+ " field: restriction\n",
+ " log2fc:\n",
+ " field: log2_shrunken_timecourses\n",
+ " dtype: numeric\n",
+ " log2_raw_ratio:\n",
+ " field: log2_ratio\n",
+ " dtype: numeric\n",
+ " time_point:\n",
+ " field: time\n",
+ " dtype: numeric\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: perturbation_id\n",
+ "\n",
+ " BrentLab/kemmeren_2014:\n",
+ " dataset:\n",
+ " kemmeren_2014:\n",
+ " sample_id:\n",
+ " field: sample_id\n",
+ " carbon_source:\n",
+ " path: media.carbon_source.compound\n",
+ " temperature_celsius:\n",
+ " path: temperature_celsius\n",
+ " dtype: numeric\n",
+ "\n",
+ " comparative_analyses:\n",
+ " - repo: BrentLab/yeast_comparative_analysis\n",
+ " dataset: dto\n",
+ " via_field: perturbation_id\n",
+ "\n",
+ " BrentLab/yeast_comparative_analysis:\n",
+ " dataset:\n",
+ " dto:\n",
+ " binding_id:\n",
+ " field: binding_id\n",
+ " perturbation_id:\n",
+ " field: perturbation_id\n",
+ " fdr:\n",
+ " field: dto_fdr\n",
+ " dtype: numeric\n",
+ " pvalue:\n",
+ " field: dto_empirical_pvalue\n",
+ " dtype: numeric\n",
+ " binding_threshold:\n",
+ " field: binding_rank_threshold\n",
+ " dtype: numeric\n",
+ " perturbation_threshold:\n",
+ " field: perturbation_rank_threshold\n",
+ " dtype: numeric\n",
+ " binding_set_size:\n",
+ " field: binding_set_size\n",
+ " dtype: numeric\n",
+ " perturbation_set_size:\n",
+ " field: perturbation_set_size\n",
+ " dtype: numeric\n",
+ "\n",
+ "factor_aliases:\n",
+ " carbon_source:\n",
+ " glucose: [D-glucose, dextrose, glu]\n",
+ " galactose: [D-galactose, gal]\n",
+ " raffinose: [D-raffinose]\n",
+ "\n",
+ "missing_value_labels:\n",
+ " carbon_source: \"unspecified\"\n",
+ "\n",
+ "description:\n",
+ " carbon_source: The carbon source provided during growth\n",
+ " temperature_celsius: Growth temperature in degrees Celsius\n",
+ " environmental_condition: Named environmental condition\n",
+ "\"\"\"\n",
+ "\n",
+ "# Save the configuration to a temporary file\n",
+ "temp_config = Path(tempfile.mkdtemp()) / \"vdb_config.yaml\"\n",
+ "temp_config.write_text(config_yaml)\n",
+ "\n",
+ "print(f\"✅ Configuration file saved at: {temp_config}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "1550d737",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✅ VirtualDB initialized successfully!\n",
+ "Number of configured repositories: 7\n",
+ "\n",
+ "Configured datasets:\n",
+ " - BrentLab/harbison_2004/harbison_2004\n",
+ " - BrentLab/rossi_2021/rossi_2021_af_combined\n",
+ " - BrentLab/mahendrawada_2025/reprocessed_diffcontrol_5prime\n",
+ " - BrentLab/callingcards/annotated_features\n",
+ " - BrentLab/hackett_2020/hackett_2020\n",
+ " - BrentLab/kemmeren_2014/kemmeren_2014\n",
+ " - BrentLab/yeast_comparative_analysis/dto\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Initialize VirtualDB\n",
+ "import os\n",
+ "from tfbpapi.virtual_db import VirtualDB\n",
+ "\n",
+ "# Token authentication required\n",
+ "hf_token = os.getenv(\"HF_TOKEN\", None)\n",
+ "\n",
+ "vdb = VirtualDB(str(temp_config), token=hf_token)\n",
+ "\n",
+ "print(\"✅ VirtualDB initialized successfully!\")\n",
+ "print(f\"Number of configured repositories: {len(vdb.config.repositories)}\")\n",
+ "\n",
+ "# List all configured datasets\n",
+ "print(\"\\nConfigured datasets:\")\n",
+ "for repo_id, repo_config in vdb.config.repositories.items():\n",
+ " if repo_config.dataset:\n",
+ " for config_name in repo_config.dataset.keys():\n",
+ " print(f\" - {repo_id}/{config_name}\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e6dc4ce8",
+ "metadata": {},
+ "source": [
+ "Queries can now include fields pulled in from linked comparative analyses."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "c9b1b241",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 41665.27it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sample_id | \n",
+ " carbon_source | \n",
+ " temperature_celsius | \n",
+ " pvalue | \n",
+ " perturbation_id | \n",
+ " dataset_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.301 | \n",
+ " BrentLab/kemmeren_2014;kemmeren_2014;18 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " NaN | \n",
+ " BrentLab/Hackett_2020;hackett_2020;33 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.512 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;34 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.306 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;40 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " NaN | \n",
+ " BrentLab/Hackett_2020;hackett_2020;37 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.309 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;38 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.644 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;36 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.411 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;35 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 2 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ " 0.536 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;39 | \n",
+ " BrentLab/harbison_2004/harbison_2004 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sample_id carbon_source temperature_celsius pvalue \\\n",
+ "0 1 glucose 30 NaN \n",
+ "1 2 glucose 30 0.301 \n",
+ "2 2 glucose 30 NaN \n",
+ "3 2 glucose 30 0.512 \n",
+ "4 2 glucose 30 0.306 \n",
+ "5 2 glucose 30 NaN \n",
+ "6 2 glucose 30 0.309 \n",
+ "7 2 glucose 30 0.644 \n",
+ "8 2 glucose 30 0.411 \n",
+ "9 2 glucose 30 0.536 \n",
+ "\n",
+ " perturbation_id \\\n",
+ "0 NaN \n",
+ "1 BrentLab/kemmeren_2014;kemmeren_2014;18 \n",
+ "2 BrentLab/Hackett_2020;hackett_2020;33 \n",
+ "3 BrentLab/Hackett_2020;hackett_2020;34 \n",
+ "4 BrentLab/Hackett_2020;hackett_2020;40 \n",
+ "5 BrentLab/Hackett_2020;hackett_2020;37 \n",
+ "6 BrentLab/Hackett_2020;hackett_2020;38 \n",
+ "7 BrentLab/Hackett_2020;hackett_2020;36 \n",
+ "8 BrentLab/Hackett_2020;hackett_2020;35 \n",
+ "9 BrentLab/Hackett_2020;hackett_2020;39 \n",
+ "\n",
+ " dataset_id \n",
+ "0 BrentLab/harbison_2004/harbison_2004 \n",
+ "1 BrentLab/harbison_2004/harbison_2004 \n",
+ "2 BrentLab/harbison_2004/harbison_2004 \n",
+ "3 BrentLab/harbison_2004/harbison_2004 \n",
+ "4 BrentLab/harbison_2004/harbison_2004 \n",
+ "5 BrentLab/harbison_2004/harbison_2004 \n",
+ "6 BrentLab/harbison_2004/harbison_2004 \n",
+ "7 BrentLab/harbison_2004/harbison_2004 \n",
+ "8 BrentLab/harbison_2004/harbison_2004 \n",
+ "9 BrentLab/harbison_2004/harbison_2004 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "all_p001 = vdb.query(\n",
+ " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n",
+ " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"pvalue\", \"perturbation_id\"],\n",
+ ")\n",
+ "\n",
+ "all_p001.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "f7f6a7f4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 37729.87it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sample_id | \n",
+ " temperature_celsius | \n",
+ " perturbation_id | \n",
+ " binding_id | \n",
+ " pvalue | \n",
+ " dataset_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 34 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;34 | \n",
+ " BrentLab/callingcards;annotated_features;394 | \n",
+ " 0.010 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 38 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;38 | \n",
+ " BrentLab/callingcards;annotated_features;380 | \n",
+ " 0.003 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 39 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;39 | \n",
+ " BrentLab/callingcards;annotated_features;380 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 39 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;39 | \n",
+ " BrentLab/callingcards;annotated_features;748 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 40 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;40 | \n",
+ " BrentLab/callingcards;annotated_features;380 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 40 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;40 | \n",
+ " BrentLab/callingcards;annotated_features;394 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 40 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;40 | \n",
+ " BrentLab/callingcards;annotated_features;748 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 44 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " BrentLab/callingcards;annotated_features;34 | \n",
+ " 0.000 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 44 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " BrentLab/harbison_2004;harbison_2004;7 | \n",
+ " 0.009 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 44 | \n",
+ " 30 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " BrentLab/harbison_2004;harbison_2004;8 | \n",
+ " 0.007 | \n",
+ " BrentLab/hackett_2020/hackett_2020 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sample_id temperature_celsius perturbation_id \\\n",
+ "0 34 30 BrentLab/Hackett_2020;hackett_2020;34 \n",
+ "1 38 30 BrentLab/Hackett_2020;hackett_2020;38 \n",
+ "2 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n",
+ "3 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n",
+ "4 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n",
+ "5 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n",
+ "6 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n",
+ "7 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n",
+ "8 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n",
+ "9 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n",
+ "\n",
+ " binding_id pvalue \\\n",
+ "0 BrentLab/callingcards;annotated_features;394 0.010 \n",
+ "1 BrentLab/callingcards;annotated_features;380 0.003 \n",
+ "2 BrentLab/callingcards;annotated_features;380 0.000 \n",
+ "3 BrentLab/callingcards;annotated_features;748 0.000 \n",
+ "4 BrentLab/callingcards;annotated_features;380 0.000 \n",
+ "5 BrentLab/callingcards;annotated_features;394 0.000 \n",
+ "6 BrentLab/callingcards;annotated_features;748 0.000 \n",
+ "7 BrentLab/callingcards;annotated_features;34 0.000 \n",
+ "8 BrentLab/harbison_2004;harbison_2004;7 0.009 \n",
+ "9 BrentLab/harbison_2004;harbison_2004;8 0.007 \n",
+ "\n",
+ " dataset_id \n",
+ "0 BrentLab/hackett_2020/hackett_2020 \n",
+ "1 BrentLab/hackett_2020/hackett_2020 \n",
+ "2 BrentLab/hackett_2020/hackett_2020 \n",
+ "3 BrentLab/hackett_2020/hackett_2020 \n",
+ "4 BrentLab/hackett_2020/hackett_2020 \n",
+ "5 BrentLab/hackett_2020/hackett_2020 \n",
+ "6 BrentLab/hackett_2020/hackett_2020 \n",
+ "7 BrentLab/hackett_2020/hackett_2020 \n",
+ "8 BrentLab/hackett_2020/hackett_2020 \n",
+ "9 BrentLab/hackett_2020/hackett_2020 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Columns used in `filters` are kept in the result even when they are not listed in the `fields` parameter.\n",
+ "all_p001 = vdb.query(\n",
+ " datasets=[(\"BrentLab/hackett_2020\", \"hackett_2020\")],\n",
+ " filters={\n",
+ " \"pvalue\": (\"<=\", 0.01) \n",
+ " },\n",
+ " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"perturbation_id\",\"binding_id\"],\n",
+ ")\n",
+ "\n",
+ "all_p001.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5bd97850",
+ "metadata": {},
+ "source": [
+ "Create a `query_dto` function that retrieves the DTO data linking a specified binding dataset to a specified perturbation dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "20864108",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found 126 DTO records\n",
+ "Column names: ['binding_id', 'perturbation_id', 'pvalue', 'fdr', 'sample_id', 'carbon_source', 'temperature_celsius']\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " binding_id | \n",
+ " perturbation_id | \n",
+ " pvalue | \n",
+ " fdr | \n",
+ " sample_id | \n",
+ " carbon_source | \n",
+ " temperature_celsius | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 12 | \n",
+ " BrentLab/harbison_2004;harbison_2004;3 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;85 | \n",
+ " 0.004 | \n",
+ " 0.000225 | \n",
+ " 3 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " BrentLab/harbison_2004;harbison_2004;3 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;87 | \n",
+ " 0.010 | \n",
+ " 0.000225 | \n",
+ " 3 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " BrentLab/harbison_2004;harbison_2004;3 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;82 | \n",
+ " 0.005 | \n",
+ " 0.000225 | \n",
+ " 3 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 50 | \n",
+ " BrentLab/harbison_2004;harbison_2004;7 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " 0.009 | \n",
+ " 0.022495 | \n",
+ " 7 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 59 | \n",
+ " BrentLab/harbison_2004;harbison_2004;8 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " 0.007 | \n",
+ " 0.080057 | \n",
+ " 8 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 61 | \n",
+ " BrentLab/harbison_2004;harbison_2004;8 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;46 | \n",
+ " 0.000 | \n",
+ " 0.062511 | \n",
+ " 8 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 66 | \n",
+ " BrentLab/harbison_2004;harbison_2004;9 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;44 | \n",
+ " 0.006 | \n",
+ " 0.110684 | \n",
+ " 9 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 68 | \n",
+ " BrentLab/harbison_2004;harbison_2004;9 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;48 | \n",
+ " 0.004 | \n",
+ " 0.317610 | \n",
+ " 9 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 71 | \n",
+ " BrentLab/harbison_2004;harbison_2004;9 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;46 | \n",
+ " 0.001 | \n",
+ " 0.117687 | \n",
+ " 9 | \n",
+ " glucose | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 76 | \n",
+ " BrentLab/harbison_2004;harbison_2004;10 | \n",
+ " BrentLab/Hackett_2020;hackett_2020;46 | \n",
+ " 0.003 | \n",
+ " 0.099272 | \n",
+ " 10 | \n",
+ " unspecified | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " binding_id \\\n",
+ "12 BrentLab/harbison_2004;harbison_2004;3 \n",
+ "17 BrentLab/harbison_2004;harbison_2004;3 \n",
+ "18 BrentLab/harbison_2004;harbison_2004;3 \n",
+ "50 BrentLab/harbison_2004;harbison_2004;7 \n",
+ "59 BrentLab/harbison_2004;harbison_2004;8 \n",
+ "61 BrentLab/harbison_2004;harbison_2004;8 \n",
+ "66 BrentLab/harbison_2004;harbison_2004;9 \n",
+ "68 BrentLab/harbison_2004;harbison_2004;9 \n",
+ "71 BrentLab/harbison_2004;harbison_2004;9 \n",
+ "76 BrentLab/harbison_2004;harbison_2004;10 \n",
+ "\n",
+ " perturbation_id pvalue fdr sample_id \\\n",
+ "12 BrentLab/Hackett_2020;hackett_2020;85 0.004 0.000225 3 \n",
+ "17 BrentLab/Hackett_2020;hackett_2020;87 0.010 0.000225 3 \n",
+ "18 BrentLab/Hackett_2020;hackett_2020;82 0.005 0.000225 3 \n",
+ "50 BrentLab/Hackett_2020;hackett_2020;44 0.009 0.022495 7 \n",
+ "59 BrentLab/Hackett_2020;hackett_2020;44 0.007 0.080057 8 \n",
+ "61 BrentLab/Hackett_2020;hackett_2020;46 0.000 0.062511 8 \n",
+ "66 BrentLab/Hackett_2020;hackett_2020;44 0.006 0.110684 9 \n",
+ "68 BrentLab/Hackett_2020;hackett_2020;48 0.004 0.317610 9 \n",
+ "71 BrentLab/Hackett_2020;hackett_2020;46 0.001 0.117687 9 \n",
+ "76 BrentLab/Hackett_2020;hackett_2020;46 0.003 0.099272 10 \n",
+ "\n",
+ " carbon_source temperature_celsius \n",
+ "12 glucose 30 \n",
+ "17 glucose 30 \n",
+ "18 glucose 30 \n",
+ "50 glucose 30 \n",
+ "59 glucose 30 \n",
+ "61 glucose 30 \n",
+ "66 glucose 30 \n",
+ "68 glucose 30 \n",
+ "71 glucose 30 \n",
+ "76 unspecified 30 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Example: Query the intersection of harbison and hackett, filter for pvalue <= 0.01\n",
+ "dto_result = vdb.query_dto(\n",
+ " binding_dataset=(\"BrentLab/harbison_2004\", \"harbison_2004\"),\n",
+ " perturbation_dataset=(\"BrentLab/hackett_2020\", \"hackett_2020\"),\n",
+ " dto_filters={\"pvalue\": (\"<=\", 0.01)},\n",
+ " fields=[\"binding_id\", \"perturbation_id\", \"pvalue\", \"fdr\",\"sample_id\", \"carbon_source\", \"temperature_celsius\"]\n",
+ ")\n",
+ "\n",
+ "print(f\"Found {len(dto_result)} DTO records\")\n",
+ "print(f\"Column names: {list(dto_result.columns)}\")\n",
+ "dto_result.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "15f63f8a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Query execution failed: Binder Error: Referenced column \"sample_id\" not found in FROM clause!\n",
+ "Candidate bindings: \"callingcards_enrichment\", \"target_symbol\"\n",
+ "\n",
+ "LINE 1: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n",
+ " ^\n",
+ "SQL: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n"
+ ]
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+ "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m all_p001 = \u001b[43mvdb\u001b[49m\u001b[43m.\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\n\u001b[32m 2\u001b[39m \u001b[43m \u001b[49m\u001b[43mdatasets\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mBrentLab/callingcards\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mannotated_features\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mcomplete\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[32m 4\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 5\u001b[39m all_p001.head()\n\u001b[32m 6\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mTotal number of rows: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(all_p001)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
+ "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/tfbpapi/virtual_db.py:406\u001b[39m, in \u001b[36mVirtualDB.query\u001b[39m\u001b[34m(self, filters, datasets, fields, complete)\u001b[39m\n\u001b[32m 403\u001b[39m results: \u001b[38;5;28mlist\u001b[39m[pd.DataFrame] = []\n\u001b[32m 404\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m repo_id, config_name \u001b[38;5;129;01min\u001b[39;00m datasets:\n\u001b[32m 405\u001b[39m \u001b[38;5;66;03m# Build metadata table\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m406\u001b[39m metadata_df = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_build_metadata_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 407\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m metadata_df.empty:\n\u001b[32m 408\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n",
+ "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/tfbpapi/virtual_db.py:690\u001b[39m, in \u001b[36mVirtualDB._build_metadata_table\u001b[39m\u001b[34m(self, repo_id, config_name, use_cache)\u001b[39m\n\u001b[32m 688\u001b[39m \u001b[38;5;66;03m# If sample_id doesn't exist, generate from row number\u001b[39;00m\n\u001b[32m 689\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33msample_id\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m df.columns \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m df.empty:\n\u001b[32m--> \u001b[39m\u001b[32m690\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33msample_id\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m.\u001b[49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 692\u001b[39m \u001b[38;5;66;03m# One row per sample_id\u001b[39;00m\n\u001b[32m 693\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33msample_id\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m df.columns:\n",
+ "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/.venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:1104\u001b[39m, in \u001b[36mIndex.astype\u001b[39m\u001b[34m(self, dtype, copy)\u001b[39m\n\u001b[32m 1100\u001b[39m new_values = \u001b[38;5;28mcls\u001b[39m._from_sequence(\u001b[38;5;28mself\u001b[39m, dtype=dtype, copy=copy)\n\u001b[32m 1102\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1103\u001b[39m \u001b[38;5;66;03m# GH#13149 specifically use astype_array instead of astype\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1104\u001b[39m new_values = \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1106\u001b[39m \u001b[38;5;66;03m# pass copy=False because any copying will be done in the astype above\u001b[39;00m\n\u001b[32m 1107\u001b[39m result = Index(new_values, name=\u001b[38;5;28mself\u001b[39m.name, dtype=new_values.dtype, copy=\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
+ "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:182\u001b[39m, in \u001b[36mastype_array\u001b[39m\u001b[34m(values, dtype, copy)\u001b[39m\n\u001b[32m 179\u001b[39m values = values.astype(dtype, copy=copy)\n\u001b[32m 181\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m182\u001b[39m values = \u001b[43m_astype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 184\u001b[39m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[32m 185\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np.dtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values.dtype.type, \u001b[38;5;28mstr\u001b[39m):\n",
+ "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:96\u001b[39m, in \u001b[36m_astype_nansafe\u001b[39m\u001b[34m(arr, dtype, copy, skipna)\u001b[39m\n\u001b[32m 94\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m arr.ndim > \u001b[32m1\u001b[39m:\n\u001b[32m 95\u001b[39m arr = arr.ravel()\n\u001b[32m---> \u001b[39m\u001b[32m96\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[43m.\u001b[49m\u001b[43mensure_string_array\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 97\u001b[39m \u001b[43m \u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m=\u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert_na_value\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[32m 98\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m.reshape(shape)\n\u001b[32m 100\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m np.issubdtype(arr.dtype, np.floating) \u001b[38;5;129;01mand\u001b[39;00m dtype.kind \u001b[38;5;129;01min\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33miu\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m _astype_float_to_int_nansafe(arr, dtype, copy)\n",
+ "\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/lib.pyx:718\u001b[39m, in \u001b[36mpandas._libs.lib.ensure_string_array\u001b[39m\u001b[34m()\u001b[39m\n",
+ "\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/lib.pyx:832\u001b[39m, in \u001b[36mpandas._libs.lib.ensure_string_array\u001b[39m\u001b[34m()\u001b[39m\n",
+ "\u001b[31mKeyboardInterrupt\u001b[39m: "
+ ]
+ }
+ ],
+ "source": [
+ "all_p001 = vdb.query( \n",
+ " datasets=[(\"BrentLab/callingcards\", \"annotated_features\")],\n",
+ " complete=False\n",
+ ")\n",
+ "all_p001.head()\n",
+ "print(f\"Total number of rows: {len(all_p001)}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py
index 1293bf9..f0abf1c 100644
--- a/tfbpapi/tests/test_virtual_db.py
+++ b/tfbpapi/tests/test_virtual_db.py
@@ -509,185 +509,6 @@ def test_parse_composite_identifier_invalid(self):
with pytest.raises(ValueError, match="Invalid composite ID format"):
VirtualDB._parse_composite_identifier("invalid:format")
- def test_get_comparative_fields_for_dataset(self):
- """Test getting comparative fields mapping."""
- with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- config = {
- "repositories": {
- "BrentLab/primary": {
- "dataset": {
- "primary_data": {
- "sample_id": {"field": "sample_id"},
- "comparative_analyses": [
- {
- "repo": "BrentLab/comparative",
- "dataset": "comp_data",
- "via_field": "binding_id",
- }
- ],
- }
- }
- },
- "BrentLab/comparative": {
- "dataset": {
- "comp_data": {
- "dto_fdr": {"field": "dto_fdr"},
- "dto_pvalue": {"field": "dto_empirical_pvalue"},
- }
- }
- },
- }
- }
- yaml.dump(config, f)
- config_path = f.name
-
- try:
- vdb = VirtualDB(config_path)
- field_mapping = vdb._get_comparative_fields_for_dataset(
- "BrentLab/primary", "primary_data"
- )
-
- # Should have dto_fdr and dto_pvalue, but NOT binding_id (via_field)
- assert "dto_fdr" in field_mapping
- assert "dto_pvalue" in field_mapping
- assert "binding_id" not in field_mapping
-
- # Check mapping structure
- assert field_mapping["dto_fdr"]["comp_repo"] == "BrentLab/comparative"
- assert field_mapping["dto_fdr"]["comp_dataset"] == "comp_data"
- assert field_mapping["dto_fdr"]["via_field"] == "binding_id"
- finally:
- Path(config_path).unlink()
-
- def test_get_comparative_fields_no_links(self):
- """Test that datasets without comparative links return empty mapping."""
- with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- config = {
- "repositories": {
- "BrentLab/primary": {
- "dataset": {
- "primary_data": {"sample_id": {"field": "sample_id"}}
- }
- }
- }
- }
- yaml.dump(config, f)
- config_path = f.name
-
- try:
- vdb = VirtualDB(config_path)
- field_mapping = vdb._get_comparative_fields_for_dataset(
- "BrentLab/primary", "primary_data"
- )
- assert field_mapping == {}
- finally:
- Path(config_path).unlink()
-
- def test_get_comparative_analyses(self):
- """Test getting comparative analysis relationships."""
- with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- config = {
- "repositories": {
- "BrentLab/primary": {
- "dataset": {
- "primary_data": {
- "sample_id": {"field": "sample_id"},
- "comparative_analyses": [
- {
- "repo": "BrentLab/comparative",
- "dataset": "comp_data",
- "via_field": "binding_id",
- }
- ],
- }
- }
- },
- "BrentLab/comparative": {
- "dataset": {"comp_data": {"dto_fdr": {"field": "dto_fdr"}}}
- },
- }
- }
- yaml.dump(config, f)
- config_path = f.name
-
- try:
- vdb = VirtualDB(config_path)
- info = vdb.get_comparative_analyses()
-
- # Check primary to comparative mapping
- assert "BrentLab/primary/primary_data" in info["primary_to_comparative"]
- links = info["primary_to_comparative"]["BrentLab/primary/primary_data"]
- assert len(links) == 1
- assert links[0]["comparative_repo"] == "BrentLab/comparative"
- assert links[0]["comparative_dataset"] == "comp_data"
- assert links[0]["via_field"] == "binding_id"
-
- # Check comparative fields
- assert "BrentLab/comparative/comp_data" in info["comparative_fields"]
- assert (
- "dto_fdr"
- in info["comparative_fields"]["BrentLab/comparative/comp_data"]
- )
- finally:
- Path(config_path).unlink()
-
- def test_get_comparative_analyses_filtered(self):
- """Test filtering comparative analyses by repo and config."""
- with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- config = {
- "repositories": {
- "BrentLab/primary1": {
- "dataset": {
- "data1": {
- "sample_id": {"field": "sample_id"},
- "comparative_analyses": [
- {
- "repo": "BrentLab/comp",
- "dataset": "comp_data",
- "via_field": "id1",
- }
- ],
- }
- }
- },
- "BrentLab/primary2": {
- "dataset": {
- "data2": {
- "sample_id": {"field": "sample_id"},
- "comparative_analyses": [
- {
- "repo": "BrentLab/comp",
- "dataset": "comp_data",
- "via_field": "id2",
- }
- ],
- }
- }
- },
- }
- }
- yaml.dump(config, f)
- config_path = f.name
-
- try:
- vdb = VirtualDB(config_path)
-
- # Get all
- all_info = vdb.get_comparative_analyses()
- assert len(all_info["primary_to_comparative"]) == 2
-
- # Filter by repo and config
- filtered = vdb.get_comparative_analyses("BrentLab/primary1", "data1")
- assert len(filtered["primary_to_comparative"]) == 1
- assert "BrentLab/primary1/data1" in filtered["primary_to_comparative"]
-
- # Filter by repo only
- repo_filtered = vdb.get_comparative_analyses("BrentLab/primary2")
- assert len(repo_filtered["primary_to_comparative"]) == 1
- assert "BrentLab/primary2/data2" in repo_filtered["primary_to_comparative"]
- finally:
- Path(config_path).unlink()
-
# Note: Full integration tests with real HuggingFace datasets would go here
# but are excluded as they require network access and specific test datasets.
diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py
index f6dd12e..80992b5 100644
--- a/tfbpapi/virtual_db.py
+++ b/tfbpapi/virtual_db.py
@@ -64,9 +64,15 @@ def get_nested_value(data: dict, path: str) -> Any:
List of dicts - extract property from each item:
get_nested_value(
- {"media": {"carbon_source": [{"compound": "glucose"},
- {"compound": "galactose"}]}},
- "media.carbon_source.compound"
+ {
+ "media": {
+ "carbon_source": [
+ {"compound": "glucose"},
+ {"compound": "galactose"},
+ ]
+ }
+ },
+ "media.carbon_source.compound",
)
Returns: ["glucose", "galactose"]
@@ -193,8 +199,6 @@ def __init__(self, config_path: Path | str, token: str | None = None):
self.config = MetadataConfig.from_yaml(config_path)
self.token = token
self.cache: dict[tuple[str, str], pd.DataFrame] = {}
- # Build mapping of comparative dataset references
- self._comparative_links = self._build_comparative_links()
def get_fields(
self, repo_id: str | None = None, config_name: str | None = None
@@ -202,6 +206,8 @@ def get_fields(
"""
Get list of queryable fields.
+ Includes fields from comparative analyses if configured.
+
:param repo_id: Optional repository ID to filter to specific dataset
:param config_name: Optional config name (required if repo_id provided)
:return: List of field names
@@ -217,7 +223,23 @@ def get_fields(
if repo_id is not None and config_name is not None:
# Get fields for specific dataset
mappings = self.config.get_property_mappings(repo_id, config_name)
- return sorted(mappings.keys())
+ fields = set(mappings.keys())
+
+ # Add fields from comparative analyses
+ repo_config = self.config.get_repository_config(repo_id)
+ if repo_config and repo_config.dataset:
+ dataset_config = repo_config.dataset.get(config_name)
+ if dataset_config and dataset_config.comparative_analyses:
+ for comp_analysis in dataset_config.comparative_analyses:
+ comp_mappings = self.config.get_property_mappings(
+ comp_analysis.repo, comp_analysis.dataset
+ )
+ # Add comparative fields (exclude via_field)
+ for field in comp_mappings.keys():
+ if field != comp_analysis.via_field:
+ fields.add(field)
+
+ return sorted(fields)
if repo_id is not None or config_name is not None:
raise ValueError(
@@ -231,16 +253,21 @@ def get_fields(
all_fields.update(repo_config.properties.keys())
# Add dataset-specific fields
if repo_config.dataset:
- for dataset_config in repo_config.dataset.values():
- # DatasetVirtualDBConfig stores property mappings in model_extra
- if (
- hasattr(dataset_config, "model_extra")
- and dataset_config.model_extra
- ):
- all_fields.update(dataset_config.model_extra.keys())
- # Also include special fields if they exist
- if dataset_config.sample_id:
- all_fields.add("sample_id")
+ for config_name, dataset_config in repo_config.dataset.items():
+ # Get property mappings (excludes comparative_analyses)
+ mappings = self.config.get_property_mappings(repo_id, config_name)
+ all_fields.update(mappings.keys())
+
+ # Add fields from comparative analyses
+ if dataset_config.comparative_analyses:
+ for comp_analysis in dataset_config.comparative_analyses:
+ comp_mappings = self.config.get_property_mappings(
+ comp_analysis.repo, comp_analysis.dataset
+ )
+ # Add comparative fields (exclude via_field)
+ for field in comp_mappings.keys():
+ if field != comp_analysis.via_field:
+ all_fields.add(field)
return sorted(all_fields)
@@ -326,101 +353,6 @@ def get_unique_values(
else:
return sorted(all_values)
- def get_comparative_analyses(
- self, repo_id: str | None = None, config_name: str | None = None
- ) -> dict[str, Any]:
- """
- Get information about comparative analysis relationships.
-
- Returns information about which comparative datasets are available
- and how they link to primary datasets. Useful for discovering
- what cross-dataset analyses can be performed.
-
- :param repo_id: Optional repository ID to filter to specific repo
- :param config_name: Optional config name (requires repo_id)
- :return: Dictionary with two keys:
- - "primary_to_comparative": Maps primary datasets to their
- comparative analyses
- - "comparative_fields": Maps comparative datasets to fields
- available for joining
- :raises ValueError: If config_name provided without repo_id
-
- Examples:
- Get all comparative analysis relationships:
- info = vdb.get_comparative_analyses()
-
- Get relationships for specific primary dataset:
- info = vdb.get_comparative_analyses(
- "BrentLab/callingcards", "annotated_features"
- )
-
- """
- if config_name and not repo_id:
- raise ValueError("repo_id required when config_name is specified")
-
- primary_to_comparative: dict[str, list[dict[str, str]]] = {}
- comparative_fields: dict[str, list[str]] = {}
-
- # Filter links based on parameters
- if repo_id and config_name:
- # Specific dataset requested
- links_to_process = {
- (repo_id, config_name): self._comparative_links.get(
- (repo_id, config_name), {}
- )
- }
- elif repo_id:
- # All configs in specific repo
- links_to_process = {
- k: v for k, v in self._comparative_links.items() if k[0] == repo_id
- }
- else:
- # All links
- links_to_process = self._comparative_links
-
- # Build primary to comparative mapping
- for (prim_repo, prim_config), link_info in links_to_process.items():
- if "comparative_analyses" not in link_info:
- continue
-
- dataset_key = f"{prim_repo}/{prim_config}"
- primary_to_comparative[dataset_key] = []
-
- for ca in link_info["comparative_analyses"]:
- primary_to_comparative[dataset_key].append(
- {
- "comparative_repo": ca["repo"],
- "comparative_dataset": ca["dataset"],
- "via_field": ca["via_field"],
- }
- )
-
- # Track which fields are available from comparative datasets
- comp_key = f"{ca['repo']}/{ca['dataset']}"
- if comp_key not in comparative_fields:
- # Get fields from the comparative dataset
- # First try config mappings
- comp_fields = self.get_fields(ca["repo"], ca["dataset"])
-
- # If no mappings, get actual fields from DataCard
- if not comp_fields:
- try:
- card = DataCard(ca["repo"], token=self.token)
- config = card.get_config(ca["dataset"])
- if config and config.dataset_info:
- comp_fields = [
- f.name for f in config.dataset_info.features
- ]
- except Exception:
- comp_fields = []
-
- comparative_fields[comp_key] = comp_fields
-
- return {
- "primary_to_comparative": primary_to_comparative,
- "comparative_fields": comparative_fields,
- }
-
def query(
self,
filters: dict[str, Any] | None = None,
@@ -475,59 +407,13 @@ def query(
if metadata_df.empty:
continue
- # Separate filters into primary and comparative
- primary_filters = {}
- comparative_filters = {}
+ # Apply filters
if filters:
- # Get comparative field mapping
- comp_field_mapping = self._get_comparative_fields_for_dataset(
- repo_id, config_name
- )
- for field, value in filters.items():
- if field in comp_field_mapping:
- comparative_filters[field] = value
- else:
- primary_filters[field] = value
-
- # Apply primary filters first
- if primary_filters:
- metadata_df = self._apply_filters(
- metadata_df, primary_filters, repo_id, config_name
- )
-
- # Enrich with comparative data if needed
- # IMPORTANT: Do this BEFORE getting complete data so comparative fields
- # are joined at the sample level, not measurement level
- # This happens when: fields are requested from comparative datasets
- # OR when filtering on comparative fields
- if fields or comparative_filters:
- comp_field_mapping = self._get_comparative_fields_for_dataset(
- repo_id, config_name
- )
- if fields:
- requested_comp_fields = [
- f for f in fields if f in comp_field_mapping
- ]
- # Also need fields that are filtered on
- filtered_comp_fields = [
- f for f in comparative_filters.keys() if f in comp_field_mapping
- ]
- all_comp_fields = list(
- set(requested_comp_fields + filtered_comp_fields)
- )
- if all_comp_fields:
- metadata_df = self._enrich_with_comparative_data(
- metadata_df, repo_id, config_name, all_comp_fields
- )
-
- # Apply comparative filters after enrichment
- if comparative_filters:
metadata_df = self._apply_filters(
- metadata_df, comparative_filters, repo_id, config_name
+ metadata_df, filters, repo_id, config_name
)
# If complete=True, join with full data
- # Do this AFTER comparative enrichment so DTO fields are already added
if complete:
sample_ids = metadata_df["sample_id"].tolist()
if sample_ids:
@@ -547,9 +433,21 @@ def query(
for field in fields:
if field in metadata_df.columns and field not in keep_cols:
keep_cols.append(field)
+
+ # IMPORTANT: Also include fields used in filters
+ # This ensures that filtered fields are always returned,
+ # even if not in fields parameter
+ if filters:
+ for filter_field in filters.keys():
+ if (
+ filter_field in metadata_df.columns
+ and filter_field not in keep_cols
+ ):
+ keep_cols.append(filter_field)
+
metadata_df = metadata_df[keep_cols].copy()
- # Add dataset identifier
+ # Add dataset identifier (ensure copy before modifying)
if "dataset_id" not in metadata_df.columns:
metadata_df = metadata_df.copy()
metadata_df["dataset_id"] = f"{repo_id}/{config_name}"
@@ -562,6 +460,117 @@ def query(
# Concatenate results, filling NaN for missing columns
return pd.concat(results, ignore_index=True, sort=False)
+ def query_dto(
+ self,
+ binding_dataset: tuple[str, str],
+ perturbation_dataset: tuple[str, str],
+ binding_filters: dict[str, Any] | None = None,
+ perturbation_filters: dict[str, Any] | None = None,
+ dto_filters: dict[str, Any] | None = None,
+ fields: list[str] | None = None,
+ ) -> pd.DataFrame:
+ """
+ Query DTO data filtered by binding and perturbation datasets.
+
+ This method uses the existing query() function to get binding dataset data
+ (which automatically includes DTO fields via comparative_analyses join),
+ then filters by perturbation_id using pandas.
+
+ :param binding_dataset: (repo_id, config_name) for binding_id source
+ :param perturbation_dataset: (repo_id, config_name) for perturbation_id source
+ :param binding_filters: Filters to apply on binding dataset
+ :param perturbation_filters: Filters to apply on perturbation dataset
+ :param dto_filters: Filters on DTO fields
+ (e.g., {"pvalue": ("<=", 0.01)})
+ :param fields: Fields to return (None = all fields including DTO fields)
+ :return: DataFrame with matching DTO records
+
+ Examples:
+ # Basic usage: query DTO intersection
+ result = vdb.query_dto(
+ binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
+ perturbation_dataset=("BrentLab/hackett_2020", "hackett_2020"),
+ dto_filters={"pvalue": ("<=", 0.01)}
+ )
+
+ # With source dataset filters
+ result = vdb.query_dto(
+ binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
+ perturbation_dataset=("BrentLab/hackett_2020", "hackett_2020"),
+ binding_filters={"carbon_source": "glucose"},
+ perturbation_filters={"temperature_celsius": 30},
+ dto_filters={"pvalue": ("<=", 0.01), "fdr": ("<=", 0.05)},
+ fields=["sample_id", "binding_id", "perturbation_id", "pvalue", "fdr"]
+ )
+
+ """
+ # Step 1: Query binding dataset
+ binding_df = self.query(
+ datasets=[binding_dataset],
+ filters=binding_filters,
+ fields=fields, # If fields specified, query will handle it
+ )
+
+ if binding_df.empty:
+ return pd.DataFrame()
+
+ # Check if perturbation_id column exists (from DTO join)
+ if "perturbation_id" not in binding_df.columns:
+ # No DTO data joined, return empty
+ return pd.DataFrame()
+
+ # Step 2: Query perturbation dataset to get sample_ids
+ perturbation_repo, perturbation_config = perturbation_dataset
+ perturbation_df = self.query(
+ datasets=[perturbation_dataset],
+ filters=perturbation_filters,
+ fields=["sample_id"],
+ )
+
+ if perturbation_df.empty or "sample_id" not in perturbation_df.columns:
+ return pd.DataFrame()
+
+ # Step 3: Build composite IDs for perturbation dataset (with case variants)
+ perturbation_ids = set()
+ for sample_id in perturbation_df["sample_id"].astype(str).unique():
+ # Original format
+ perturbation_ids.add(
+ f"{perturbation_repo};{perturbation_config};{sample_id}"
+ )
+ # Capitalized variant (e.g., hackett_2020 -> Hackett_2020)
+ if "/" in perturbation_repo:
+ parts = perturbation_repo.split("/", 1)
+ if len(parts) == 2 and parts[1]:
+ alt_repo = f"{parts[0]}/{parts[1][0].upper()}{parts[1][1:]}"
+ perturbation_ids.add(
+ f"{alt_repo};{perturbation_config};{sample_id}"
+ )
+
+ # Step 4: Filter binding_df to only keep rows where perturbation_id matches
+ # Handle NaN values in perturbation_id
+ result_df = binding_df[
+ binding_df["perturbation_id"].isin(perturbation_ids)
+ ].copy()
+
+ if result_df.empty:
+ return pd.DataFrame()
+
+ # Step 5: Apply DTO filters if provided
+ if dto_filters:
+ # Get binding dataset info for filter application
+ binding_repo, binding_config = binding_dataset
+ result_df = self._apply_filters(
+ result_df, dto_filters, binding_repo, binding_config
+ )
+
+ # Step 6: Select requested fields if specified
+ if fields:
+ available_fields = [f for f in fields if f in result_df.columns]
+ if available_fields:
+ result_df = result_df[available_fields].copy()
+
+ return result_df
+
def materialize_views(self, datasets: list[tuple[str, str]] | None = None) -> None:
"""
Build and cache metadata DataFrames for faster subsequent queries.
@@ -605,296 +614,6 @@ def invalidate_cache(self, datasets: list[tuple[str, str]] | None = None) -> Non
if dataset_key in self.cache:
del self.cache[dataset_key]
- def _build_comparative_links(self) -> dict[tuple[str, str], dict[str, Any]]:
- """
- Build mapping of primary datasets to their comparative dataset references.
-
- Returns dict keyed by (repo_id, config_name) with value being dict: {
- "comparative_analyses": [ { "repo": comparative_repo_id,
- "dataset": comparative_config_name, "via_field":
- field_name_with_composite_ids } ] }
-
- """
- links: dict[tuple[str, str], dict[str, Any]] = {}
-
- for repo_id, repo_config in self.config.repositories.items():
- if not repo_config.dataset:
- continue
-
- for config_name, dataset_config in repo_config.dataset.items():
- if dataset_config.comparative_analyses:
- links[(repo_id, config_name)] = {
- "comparative_analyses": [
- {
- "repo": ca.repo,
- "dataset": ca.dataset,
- "via_field": ca.via_field,
- }
- for ca in dataset_config.comparative_analyses
- ]
- }
-
- return links
-
- def _get_comparative_fields_for_dataset(
- self, repo_id: str, config_name: str
- ) -> dict[str, dict[str, str]]:
- """
- Get mapping of comparative fields available for a primary dataset.
-
- :param repo_id: Primary dataset repository ID
- :param config_name: Primary dataset config name
- :return: Dict mapping field_name to comparative dataset info
- {field_name: {
- "comp_repo": comparative_repo_id,
- "comp_dataset": comparative_dataset_name,
- "via_field": field_with_composite_ids
- }}
-
- Example:
- For callingcards dataset linked to DTO via binding_id:
- {
- "dto_fdr": {
- "comp_repo": "BrentLab/yeast_comparative_analysis",
- "comp_dataset": "dto",
- "via_field": "binding_id"
- },
- "dto_empirical_pvalue": {...}
- }
-
- """
- field_mapping: dict[str, dict[str, str]] = {}
-
- # Get comparative analyses for this dataset
- links = self._comparative_links.get((repo_id, config_name), {})
- if "comparative_analyses" not in links:
- return field_mapping
-
- # For each comparative dataset, get its fields
- for ca in links["comparative_analyses"]:
- comp_repo = ca["repo"]
- comp_dataset = ca["dataset"]
- via_field = ca["via_field"]
-
- # Get fields from comparative dataset
- comp_fields = self.get_fields(comp_repo, comp_dataset)
-
- # If no fields from config, try DataCard
- if not comp_fields:
- try:
- from tfbpapi.datacard import DataCard
-
- card = DataCard(comp_repo, token=self.token)
- config = card.get_config(comp_dataset)
- if config and config.dataset_info:
- comp_fields = [f.name for f in config.dataset_info.features]
- except Exception:
- comp_fields = []
-
- # Map each field to this comparative dataset
- for field_name in comp_fields:
- # Skip the via_field itself (it's the join key)
- if field_name == via_field:
- continue
-
- field_mapping[field_name] = {
- "comp_repo": comp_repo,
- "comp_dataset": comp_dataset,
- "via_field": via_field,
- }
-
- return field_mapping
-
- def _enrich_with_comparative_data(
- self,
- primary_df: pd.DataFrame,
- repo_id: str,
- config_name: str,
- requested_fields: list[str],
- ) -> pd.DataFrame:
- """
- Enrich primary dataset with fields from comparative datasets.
-
- :param primary_df: Primary dataset DataFrame with sample_id column
- :param repo_id: Primary dataset repository ID
- :param config_name: Primary dataset config name
- :param requested_fields: List of field names requested by user
- :return: DataFrame enriched with comparative fields
-
- """
- # Get mapping of which fields come from which comparative datasets
- comp_field_mapping = self._get_comparative_fields_for_dataset(
- repo_id, config_name
- )
-
- if not comp_field_mapping:
- return primary_df
-
- # Find which requested fields are from comparative datasets
- comp_fields_to_fetch = [f for f in requested_fields if f in comp_field_mapping]
-
- if not comp_fields_to_fetch:
- return primary_df
-
- # Group fields by comparative dataset to minimize queries
- by_comp_dataset: dict[tuple[str, str, str], list[str]] = {}
- for field in comp_fields_to_fetch:
- info = comp_field_mapping[field]
- key = (info["comp_repo"], info["comp_dataset"], info["via_field"])
- if key not in by_comp_dataset:
- by_comp_dataset[key] = []
- by_comp_dataset[key].append(field)
-
- # For each comparative dataset, load and join
- result_df = primary_df.copy()
-
- for (comp_repo, comp_dataset, via_field), fields in by_comp_dataset.items():
- try:
- # Load comparative dataset using HfCacheManager
- # but query the raw data table instead of metadata view
- from tfbpapi.hf_cache_manager import HfCacheManager
-
- comp_cache_mgr = HfCacheManager(
- comp_repo, duckdb_conn=duckdb.connect(":memory:"), token=self.token
- )
-
- # Get the config to load data
- comp_config = comp_cache_mgr.get_config(comp_dataset)
- if not comp_config:
- continue
-
- # Load the data (this will download and register parquet files)
- result = comp_cache_mgr._get_metadata_for_config(comp_config)
- if not result.get("success", False):
- continue
-
- # Now query the raw data table directly (not the metadata view)
- # The raw table name is config_name without "metadata_" prefix
- select_fields = [via_field] + fields
- columns = ", ".join(select_fields)
-
- # Query the actual parquet data by creating a view from the files
- try:
- # Get file paths that were loaded
- import glob
-
- from huggingface_hub import snapshot_download
-
- cache_dir = snapshot_download(
- repo_id=comp_repo,
- repo_type="dataset",
- allow_patterns=f"{comp_dataset}/**/*.parquet",
- token=self.token,
- )
-
- parquet_files = glob.glob(
- f"{cache_dir}/{comp_dataset}/**/*.parquet", recursive=True
- )
-
- if not parquet_files:
- continue
-
- # Create a temporary view from parquet files
- temp_view = f"temp_{comp_dataset}_raw"
- files_sql = ", ".join([f"'{f}'" for f in parquet_files])
- comp_cache_mgr.duckdb_conn.execute(
- f"CREATE OR REPLACE VIEW {temp_view} AS "
- f"SELECT * FROM read_parquet([{files_sql}])"
- )
-
- # Query the view
- sql = f"SELECT {columns} FROM {temp_view}"
- comp_df = comp_cache_mgr.duckdb_conn.execute(sql).fetchdf()
-
- except Exception:
- # If direct parquet loading fails, skip this comparative dataset
- continue
-
- if comp_df.empty:
- continue
-
- # Parse composite identifiers to extract sample_id
- # via_field contains values like
- # "BrentLab/harbison_2004;harbison_2004;123"
- # We need to extract the third component and match on
- # current repo/config
- def extract_sample_id(composite_id: str) -> str | None:
- """Extract sample_id if composite matches current dataset."""
- if pd.isna(composite_id):
- return None
- try:
- parts = composite_id.split(";")
- if len(parts) != 3:
- return None
- # Check if this composite ID references our dataset
- if parts[0] == repo_id and parts[1] == config_name:
- return parts[2]
- return None
- except Exception:
- return None
-
- comp_df["_join_sample_id"] = comp_df[via_field].apply(extract_sample_id)
-
- # Convert _join_sample_id to match primary_df sample_id dtype
- # This handles cases where sample_id is int but composite has string
- if "_join_sample_id" in comp_df.columns:
- primary_dtype = primary_df["sample_id"].dtype
- if pd.api.types.is_integer_dtype(primary_dtype):
- # Convert to numeric, coercing errors to NaN
- comp_df["_join_sample_id"] = pd.to_numeric(
- comp_df["_join_sample_id"], errors="coerce"
- )
- elif pd.api.types.is_string_dtype(primary_dtype):
- comp_df["_join_sample_id"] = comp_df["_join_sample_id"].astype(
- str
- )
-
- # Filter to only rows that match our dataset
- comp_df = comp_df[comp_df["_join_sample_id"].notna()].copy()
-
- if comp_df.empty:
- continue
-
- # Drop the via_field column (we don't need it in results)
- comp_df = comp_df.drop(columns=[via_field])
-
- # Merge with primary data
- result_df = result_df.merge(
- comp_df, left_on="sample_id", right_on="_join_sample_id", how="left"
- )
-
- # Drop the temporary join column
- result_df = result_df.drop(columns=["_join_sample_id"])
-
- except Exception:
- # If enrichment fails for this comparative dataset, continue
- continue
-
- return result_df
-
- @staticmethod
- def _parse_composite_identifier(composite_id: str) -> tuple[str, str, str]:
- """
- Parse composite sample identifier into components.
-
- :param composite_id: Composite ID in format "repo_id;config_name;sample_id"
- :return: Tuple of (repo_id, config_name, sample_id)
-
- Example:
- _parse_composite_identifier(
- "BrentLab/harbison_2004;harbison_2004;sample_42"
- )
- Returns: ("BrentLab/harbison_2004", "harbison_2004", "sample_42")
-
- """
- parts = composite_id.split(";")
- if len(parts) != 3:
- raise ValueError(
- f"Invalid composite ID format: {composite_id}. "
- "Expected 'repo_id;config_name;sample_id'"
- )
- return parts[0], parts[1], parts[2]
-
def _build_metadata_table(
self, repo_id: str, config_name: str, use_cache: bool = True
) -> pd.DataFrame:
@@ -941,19 +660,23 @@ def _build_metadata_table(
# Get sample-level data from HuggingFace
config = card.get_config(config_name)
- # Check if this is a comparative dataset
- from tfbpapi.models import DatasetType
-
- is_comparative = (
- config
- and hasattr(config, "dataset_type")
- and config.dataset_type == DatasetType.COMPARATIVE
- )
+ # Check if sample_id exists in the data by trying a sample query
+ has_sample_id = False
+ try:
+ sample_df = cache_mgr.query(
+ f"SELECT sample_id FROM {config_name} LIMIT 1", config_name
+ )
+ has_sample_id = "sample_id" in sample_df.columns
+ except Exception:
+ # If query fails, assume sample_id doesn't exist
+ has_sample_id = False
if config and hasattr(config, "metadata_fields") and config.metadata_fields:
# Select only metadata fields
columns = ", ".join(config.metadata_fields)
- if not is_comparative and "sample_id" not in config.metadata_fields:
+ # Only add sample_id field if it exists in the data
+ # and not already in metadata_fields
+ if has_sample_id and "sample_id" not in config.metadata_fields:
columns = f"sample_id, {columns}"
sql = f"SELECT DISTINCT {columns} FROM {config_name}"
else:
@@ -962,9 +685,12 @@ def _build_metadata_table(
df = cache_mgr.query(sql, config_name)
- # For non-comparative datasets: one row per sample_id
- # For comparative datasets: keep all rows (each row is a relationship)
- if not is_comparative and "sample_id" in df.columns:
+ # If sample_id doesn't exist, generate from row number
+ if "sample_id" not in df.columns and not df.empty:
+ df["sample_id"] = df.index.astype(str)
+
+ # One row per sample_id
+ if "sample_id" in df.columns:
df = df.groupby("sample_id").first().reset_index()
# Add repo-level metadata as columns
@@ -976,8 +702,8 @@ def _build_metadata_table(
if field_metadata:
df = self._add_field_metadata(df, field_metadata)
- # Apply dtype conversions to DataFrame columns
- df = self._apply_column_dtypes(df, property_mappings)
+ # Join comparative analyses data if configured
+ df = self._join_comparative_analyses(df, repo_id, config_name)
# Cache result
if use_cache:
@@ -985,73 +711,9 @@ def _build_metadata_table(
return df
- except Exception as e:
- # Log error for debugging with full traceback
- import traceback
-
- print(f"Error downloading metadata for {config_name}: {e}")
- traceback.print_exc()
- # Return empty DataFrame on error
+ except Exception:
return pd.DataFrame()
- def _apply_column_dtypes(
- self, df: pd.DataFrame, property_mappings: dict[str, PropertyMapping]
- ) -> pd.DataFrame:
- """
- Apply dtype conversions to DataFrame columns based on property mappings.
-
- :param df: DataFrame to apply conversions to
- :param property_mappings: Property mappings with dtype specifications
- :return: DataFrame with converted column dtypes
-
- """
- for prop_name, mapping in property_mappings.items():
- # Skip if no dtype specified or column doesn't exist
- if not mapping.dtype or prop_name not in df.columns:
- continue
-
- # Convert column dtype
- try:
- if mapping.dtype == "numeric":
- df[prop_name] = pd.to_numeric(df[prop_name], errors="coerce")
- elif mapping.dtype == "bool":
- df[prop_name] = df[prop_name].astype(bool)
- elif mapping.dtype == "string":
- df[prop_name] = df[prop_name].astype(str)
- except (ValueError, TypeError):
- # Conversion failed, leave as is
- pass
-
- return df
-
- def _convert_dtype(self, value: Any, dtype: str) -> Any:
- """
- Convert value to specified data type.
-
- :param value: The value to convert to a given `dtype`
- :param dtype: Target data type ("numeric", "bool", "string")
-
- :return: Converted value or None if conversion fails
-
- """
- if value is None:
- return None
-
- try:
- if dtype == "numeric":
- # Try float first (handles both int and float)
- return float(value)
- elif dtype == "bool":
- return bool(value)
- elif dtype == "string":
- return str(value)
- else:
- # Unknown dtype, pass through unchanged
- return value
- except (ValueError, TypeError):
- # Conversion failed, return None
- return None
-
def _extract_repo_level(
self,
card: DataCard,
@@ -1085,12 +747,14 @@ def _extract_repo_level(
continue
# Build full path
- # Note: `conditions` is already the experimental_conditions dict,
- # so we don't add the prefix
full_path = mapping.path
+ # Skip if path is None (shouldn't happen for repo-level, but be safe)
+ if full_path is None:
+ continue
+
# Get value at path
- value = get_nested_value(conditions, full_path) # type: ignore
+ value = get_nested_value(conditions, full_path)
# Handle missing values
missing_label = self.config.missing_value_labels.get(prop_name)
@@ -1102,12 +766,6 @@ def _extract_repo_level(
# Ensure value is a list
actual_values = [value] if not isinstance(value, list) else value
- # Apply dtype conversion if specified
- if mapping.dtype:
- actual_values = [
- self._convert_dtype(v, mapping.dtype) for v in actual_values
- ]
-
# Normalize using aliases
aliases = self.config.factor_aliases.get(prop_name)
normalized_values = [
@@ -1136,18 +794,17 @@ def _extract_field_level(
field_metadata: dict[str, dict[str, Any]] = {}
# Group property mappings by field
- field_mappings: dict[str, dict[str, PropertyMapping]] = {}
+ field_mappings: dict[str, dict[str, str | None]] = {}
for prop_name, mapping in property_mappings.items():
- # Only process if field is specified AND path exists
- # (no path means it's just a column alias, not metadata extraction)
- if mapping.field is not None and mapping.path is not None:
+ if mapping.field is not None:
field_name = mapping.field
if field_name not in field_mappings:
field_mappings[field_name] = {}
- field_mappings[field_name][prop_name] = mapping
+ # Store path (can be None for column aliases)
+ field_mappings[field_name][prop_name] = mapping.path
# Process each field that has mappings
- for field_name, prop_mappings_dict in field_mappings.items():
+ for field_name, prop_paths in field_mappings.items():
# Get field definitions
definitions = card.get_field_definitions(config_name, field_name)
if not definitions:
@@ -1158,9 +815,13 @@ def _extract_field_level(
if field_value not in field_metadata:
field_metadata[field_value] = {}
- for prop_name, mapping in prop_mappings_dict.items():
- # Get value at path
- value = get_nested_value(definition, mapping.path) # type: ignore
+ for prop_name, path in prop_paths.items():
+ # Handle path=None case: use field_value directly
+ if path is None:
+ value = field_value
+ else:
+ # Get value at path
+ value = get_nested_value(definition, path)
# Handle missing values
missing_label = self.config.missing_value_labels.get(prop_name)
@@ -1172,12 +833,6 @@ def _extract_field_level(
# Ensure value is a list
actual_values = [value] if not isinstance(value, list) else value
- # Apply dtype conversion if specified
- if mapping.dtype:
- actual_values = [
- self._convert_dtype(v, mapping.dtype) for v in actual_values
- ]
-
# Normalize using aliases
aliases = self.config.factor_aliases.get(prop_name)
normalized_values = [
@@ -1243,23 +898,31 @@ def _apply_filters(
# Handle numeric range filters
if isinstance(filter_value, tuple):
operator = filter_value[0]
+ # For numeric comparisons, try to convert column to numeric
+ # (normalize_value returns strings,
+ # but we need numeric for range queries)
+ try:
+ df_field = pd.to_numeric(df[field], errors="coerce")
+ except (ValueError, TypeError):
+ df_field = df[field]
+
if operator == "between" and len(filter_value) == 3:
df = df[
- (df[field] >= filter_value[1]) & (df[field] <= filter_value[2])
+ (df_field >= filter_value[1]) & (df_field <= filter_value[2])
]
elif operator in (">=", ">", "<=", "<", "==", "!="):
if operator == ">=":
- df = df[df[field] >= filter_value[1]]
+ df = df[df_field >= filter_value[1]]
elif operator == ">":
- df = df[df[field] > filter_value[1]]
+ df = df[df_field > filter_value[1]]
elif operator == "<=":
- df = df[df[field] <= filter_value[1]]
+ df = df[df_field <= filter_value[1]]
elif operator == "<":
- df = df[df[field] < filter_value[1]]
+ df = df[df_field < filter_value[1]]
elif operator == "==":
- df = df[df[field] == filter_value[1]]
+ df = df[df_field == filter_value[1]]
elif operator == "!=":
- df = df[df[field] != filter_value[1]]
+ df = df[df_field != filter_value[1]]
else:
# Exact match with alias expansion
aliases = self.config.factor_aliases.get(field)
@@ -1273,9 +936,11 @@ def _apply_filters(
df = df[df[field].isin(expanded_values)]
else:
# No aliases, exact match
- df = df[df[field] == filter_value]
+ # Handle type conversion: normalize_value returns strings,
+ # so convert filter_value to string for comparison
+ df = df[df[field] == str(filter_value)]
- return df
+ return df.copy()
def _get_complete_data(
self,
@@ -1331,6 +996,264 @@ def _get_complete_data(
except Exception:
return pd.DataFrame()
+ @staticmethod
+ def _parse_composite_identifier(composite_id: str) -> tuple[str, str, str]:
+ """
+ Parse composite identifier into repo_id, config_name, and sample_id.
+
+ Format: "repo_id;config_name;sample_id"
+
+ :param composite_id: Composite identifier string
+ :return: Tuple of (repo_id, config_name, sample_id)
+ :raises ValueError: If format is invalid
+
+ Example:
+ >>> VirtualDB._parse_composite_identifier(
+ ... "BrentLab/harbison_2004;harbison_2004;42"
+ ... )
+ ("BrentLab/harbison_2004", "harbison_2004", "42")
+
+ """
+ parts = composite_id.split(";")
+ if len(parts) != 3:
+ raise ValueError(
+ f"Invalid composite ID format: {composite_id}. "
+ "Expected format: 'repo_id;config_name;sample_id'"
+ )
+ return tuple(parts) # type: ignore
+
    def _join_comparative_analyses(
        self, df: pd.DataFrame, repo_id: str, config_name: str
    ) -> pd.DataFrame:
        """
        Join comparative analyses data to the primary dataset DataFrame.

        For each comparative_analysis configured for this dataset, loads the comparative
        dataset directly via SQL and joins fields via composite identifiers.

        Composite identifiers have the canonical form
        "repo_id;config_name;sample_id". Because comparative datasets are
        known to store these IDs in several variant spellings, this method
        also matches a capitalized repo variant (e.g. "hackett_2020" ->
        "Hackett_2020") and a forward-slash variant
        ("repo_id/config_name;sample_id"), mapping all of them back to the
        canonical form before the join.

        Joining is best-effort: any exception raised while processing one
        comparative analysis causes that analysis to be skipped (its temp
        column cleaned up) rather than failing the whole call, so the
        primary data is always returned.

        :param df: Primary dataset DataFrame with sample_id column
        :param repo_id: Repository ID of the primary dataset
        :param config_name: Config name of the primary dataset
        :return: DataFrame with joined comparative analysis fields

        """
        # Nothing to join onto without rows or a sample_id key column.
        if df.empty or "sample_id" not in df.columns:
            return df

        # Get dataset configuration; bail out (returning df unchanged) if
        # this repo/config has no comparative analyses configured.
        repo_config = self.config.get_repository_config(repo_id)
        if not repo_config or not repo_config.dataset:
            return df

        dataset_config = repo_config.dataset.get(config_name)
        if not dataset_config or not dataset_config.comparative_analyses:
            return df

        # Work on a copy so the caller's DataFrame is never mutated.
        result_df = df.copy()

        # Process each comparative analysis
        for comp_analysis in dataset_config.comparative_analyses:
            try:
                # Build composite identifier column for join
                # Format: "repo_id;config_name;sample_id"
                temp_composite_col = "_temp_composite_id"
                result_df[temp_composite_col] = f"{repo_id};{config_name};" + result_df[
                    "sample_id"
                ].astype(str)

                # Get property mappings for comparative dataset
                comp_mappings = self.config.get_property_mappings(
                    comp_analysis.repo, comp_analysis.dataset
                )

                # Build mapping from property names to actual column names
                # PropertyMapping.field points to the actual column name in the dataset
                # (falls back to the property name itself when no field is set).
                prop_to_col: dict[str, str] = {}
                for prop_name, mapping in comp_mappings.items():
                    if mapping.field:
                        prop_to_col[prop_name] = mapping.field
                    else:
                        prop_to_col[prop_name] = prop_name

                # Get via_field actual column name
                via_field = comp_analysis.via_field
                via_field_col = prop_to_col.get(via_field, via_field)

                # Determine which fields to select from comparative dataset
                # Exclude via_field and sample_id from the join fields
                fields_to_join_props = [
                    prop_name
                    for prop_name in prop_to_col.keys()
                    if prop_name not in ["sample_id", via_field, "dataset_id"]
                ]

                # Nothing to pull in from this analysis: clean up and move on.
                if not fields_to_join_props:
                    result_df = result_df.drop(columns=[temp_composite_col])
                    continue

                # Build SQL columns: actual column names
                sql_columns = [via_field_col]
                sql_columns.extend([prop_to_col[prop] for prop in fields_to_join_props])
                sql_columns_str = ", ".join(sql_columns)

                # Load comparative dataset directly via SQL
                # This bypasses _build_metadata_table which may not work
                # for comparative datasets
                # NOTE(review): a fresh in-memory DuckDB connection is created
                # per analysis and never explicitly closed — relies on GC for
                # cleanup; confirm this is acceptable for long-running use.
                comp_cache_mgr = HfCacheManager(
                    comp_analysis.repo,
                    duckdb_conn=duckdb.connect(":memory:"),
                    token=self.token,
                )

                # Get the actual table name (metadata_{config_name})
                # to avoid string replacement issues
                comp_config = comp_cache_mgr.get_config(comp_analysis.dataset)
                if not comp_config:
                    result_df = result_df.drop(columns=[temp_composite_col])
                    continue

                # Load the config to get the actual table name
                config_result = comp_cache_mgr._get_metadata_for_config(
                    comp_config, force_refresh=False
                )
                if not config_result.get("success", False):
                    result_df = result_df.drop(columns=[temp_composite_col])
                    continue

                actual_table_name = config_result.get("table_name")
                if not actual_table_name:
                    # Fall back to the conventional naming scheme.
                    actual_table_name = f"metadata_{comp_analysis.dataset}"

                # Build WHERE clause to filter only matching records
                # Try both original repo_id and capitalized version
                # (e.g., hackett_2020 -> Hackett_2020)
                composite_ids = result_df[temp_composite_col].unique().tolist()

                # Generate alternative repo_id format with capitalized
                # first letter after slash
                # e.g., "BrentLab/hackett_2020" -> "BrentLab/Hackett_2020"
                alternative_repo_id = repo_id
                if "/" in repo_id:
                    parts = repo_id.split("/", 1)
                    if len(parts) == 2 and parts[1]:
                        # Capitalize first letter of dataset name
                        alternative_repo_id = (
                            f"{parts[0]}/{parts[1][0].upper()}{parts[1][1:]}"
                        )

                # Build composite IDs with both formats
                all_composite_ids = set(composite_ids)  # Original format
                if alternative_repo_id != repo_id:
                    # Add alternative format for each sample_id
                    for sample_id in result_df["sample_id"].astype(str).unique():
                        alt_id = f"{alternative_repo_id};{config_name};{sample_id}"
                        all_composite_ids.add(alt_id)

                # Add forward-slash format variants
                # (e.g., "BrentLab/rossi_2021/rossi_2021_af_combined;{sample_id}")
                # This handles cases where DTO data uses "/" instead of ";"
                # between repo_id and config_name
                for sample_id in result_df["sample_id"].astype(str).unique():
                    # Format: "repo_id/config_name;sample_id"
                    slash_format_id = f"{repo_id}/{config_name};{sample_id}"
                    all_composite_ids.add(slash_format_id)
                    # Also add capitalized variant if applicable
                    if alternative_repo_id != repo_id:
                        alt_slash_format_id = (
                            f"{alternative_repo_id}/{config_name};{sample_id}"
                        )
                        all_composite_ids.add(alt_slash_format_id)

                # Escape single quotes in composite IDs
                # (standard SQL string-literal escaping: ' -> '')
                escaped_ids = [cid.replace("'", "''") for cid in all_composite_ids]
                id_list = ", ".join([f"'{cid}'" for cid in escaped_ids])

                # Use actual table name directly to avoid column name replacement issues
                sql = f"""
                SELECT {sql_columns_str}
                FROM {actual_table_name}
                WHERE {via_field_col} IN ({id_list})
                """

                # Execute query directly instead of using query() method
                # to avoid string replacement
                try:
                    comp_df = comp_cache_mgr.duckdb_conn.execute(sql).fetchdf()

                except Exception:
                    result_df = result_df.drop(columns=[temp_composite_col])
                    continue

                if comp_df.empty:
                    result_df = result_df.drop(columns=[temp_composite_col])
                    continue

                # Rename columns to use property names (config names)
                # instead of raw column names
                rename_dict = {via_field_col: via_field}
                for prop_name in fields_to_join_props:
                    actual_col = prop_to_col[prop_name]
                    if actual_col != prop_name:
                        rename_dict[actual_col] = prop_name

                comp_df = comp_df.rename(columns=rename_dict)

                # Map DTO composite IDs back to original format for join
                # This handles cases where DTO uses:
                # 1. Capitalized repo_id (e.g., Hackett_2020)
                # 2. Forward-slash format
                # (e.g., "BrentLab/rossi_2021/rossi_2021_af_combined;{sample_id}")
                if via_field in comp_df.columns:
                    # Create mapping from all alternative formats to original format
                    id_mapping = {}
                    for sample_id in result_df["sample_id"].astype(str).unique():
                        original_id = f"{repo_id};{config_name};{sample_id}"

                        # Add capitalized variant mapping
                        if alternative_repo_id != repo_id:
                            alt_id = f"{alternative_repo_id};{config_name};{sample_id}"
                            id_mapping[alt_id] = original_id

                        # Add forward-slash format mappings
                        slash_format_id = f"{repo_id}/{config_name};{sample_id}"
                        id_mapping[slash_format_id] = original_id
                        if alternative_repo_id != repo_id:
                            alt_slash_format_id = (
                                f"{alternative_repo_id}/{config_name};{sample_id}"
                            )
                            id_mapping[alt_slash_format_id] = original_id

                    # Map alternative format IDs to original format
                    # NOTE(review): `id_mapping.get(x, x)` already defaults to
                    # x, so the `if x in id_mapping else x` guard is a
                    # redundant second lookup — harmless but removable.
                    comp_df[temp_composite_col] = comp_df[via_field].map(
                        lambda x: id_mapping.get(x, x) if x in id_mapping else x
                    )
                    # Use mapped column for join
                    join_right_on = temp_composite_col
                else:
                    # No alternative format needed, use original via_field
                    join_right_on = via_field

                # Perform left join on composite identifier
                # (left join keeps every primary row, even with no match;
                # comparative columns that collide get a dataset suffix)
                result_df = result_df.merge(
                    comp_df,
                    left_on=temp_composite_col,
                    right_on=join_right_on,
                    how="left",
                    suffixes=("", f"_{comp_analysis.dataset}"),
                )

                # Drop the temporary composite_id column
                result_df = result_df.drop(columns=[temp_composite_col])

            except Exception:

                # Clean up temp column if it exists
                # (best-effort: skip this comparative analysis entirely)
                if temp_composite_col in result_df.columns:
                    result_df = result_df.drop(columns=[temp_composite_col])
                continue

        return result_df
+
def __repr__(self) -> str:
"""String representation."""
n_repos = len(self.config.repositories)