diff --git a/docs/tutorials/DTO_analysis.ipynb b/docs/tutorials/DTO_analysis.ipynb new file mode 100644 index 0000000..4c11286 --- /dev/null +++ b/docs/tutorials/DTO_analysis.ipynb @@ -0,0 +1,968 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DTO Analysis: Significance Filtering of Cross-Dataset Binding Samples\n", + "\n", + "This notebook analyzes the correlation between transcription factor (TF) binding data and perturbation data.\n", + "\n", + "## Analysis Objectives\n", + "\n", + "1. Select all binding samples with DTO p <= 0.01 compared to **Hackett-2020-ZEV**.\n", + "2. Select all binding samples with DTO p <= 0.01 compared to **Kemmeren-2014-TFKO**.\n", + "3. Find the intersection of the two sets above.\n", + "4. For each regulator in the active set, count the number of active samples. (An illustrative sketch of objectives 3 and 4 appears at the end of this notebook.)\n", + "\n", + "## Challenges and Additional Analysis\n", + "\n", + "- Explore the time-point effects in the Hackett data.\n", + "- Analyze the impact of different time points on the DTO distribution.\n", + "- Select the optimal conditions (e.g., ZEV vs GEV) for each regulator." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from collections import Counter\n", + "\n", + "# Set display options\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_rows', 100)\n", + "pd.set_option('display.width', None)\n", + "\n", + "# Set plot style\n", + "plt.style.use('seaborn-v0_8-whitegrid')\n", + "sns.set_palette('husl')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u2705 Configuration file saved at: /tmp/tmpv37ibelm/vdb_config.yaml\n" + ] + } + ], + "source": [ + "# Create VirtualDB configuration\n", + "# This configuration defines how to map the fields of different datasets and how to associate DTO comparative analysis data\n", + "\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "config_yaml = \"\"\"\n", + "repositories:\n", + "  BrentLab/harbison_2004:\n", + "    dataset:\n", + "      harbison_2004:\n", + "        sample_id:\n", + "          field: sample_id\n", + "        carbon_source:\n", + "          field: condition\n", + "          path: media.carbon_source.compound\n", + "        temperature_celsius:\n", + "          field: condition\n", + "          path: temperature_celsius\n", + "          dtype: numeric\n", + "        environmental_condition:\n", + "          field: condition\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        regulator_symbol:\n", + "          field: regulator_symbol\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            dataset: dto\n", + "            via_field: binding_id\n", + "\n", + "  BrentLab/rossi_2021:\n", + "    carbon_source: \n", + "      path: media.carbon_source.compound\n", + "    temperature_celsius: \n", + "      path: temperature_celsius\n", + "    dataset:\n", + "      rossi_2021_af_combined:\n", + "        sample_id: \n", + "          field: sample_id\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        target_locus_tag:\n", + "          field: target_locus_tag\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            dataset: dto\n", + "            via_field: binding_id\n", + "\n", + "  BrentLab/mahendrawada_2025:\n", + "    dataset:\n", + "      reprocessed_diffcontrol_5prime:\n", + "        sample_id:\n", + 
" field: sample_id\n", + " control_source:\n", + " field: control_source\n", + " regulator_locus_tag:\n", + " field: regulator_locus_tag\n", + " regulator_symbol:\n", + " field: regulator_symbol\n", + " environmental_condition:\n", + " field: condition\n", + " temperature_celsius:\n", + " field: condition\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + " media_name:\n", + " field: condition\n", + " path: media.name\n", + " carbon_source:\n", + " field: condition\n", + " path: media.carbon_source\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: binding_id\n", + "\n", + "\n", + " BrentLab/callingcards:\n", + " carbon_source: \n", + " path: media.carbon_source.compound\n", + " temperature_celsius: \n", + " path: temperature_celsius\n", + " dataset:\n", + " annotated_features:\n", + " id:\n", + " field: id\n", + " regulator_locus_tag:\n", + " field: target_locus_tag\n", + " regulator_symbol:\n", + " field: target_symbol\n", + " \n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: binding_id\n", + " \n", + " BrentLab/hackett_2020:\n", + " dataset:\n", + " hackett_2020:\n", + " sample_id:\n", + " field: sample_id\n", + " dtype: numeric\n", + " regulator_locus_tag:\n", + " field: regulator_locus_tag\n", + " temperature_celsius:\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + " cultivation_method:\n", + " path: cultivation_method\n", + " media_name:\n", + " path: media.name\n", + " induction_system:\n", + " field: mechanism\n", + " inducer_compound:\n", + " field: mechanism\n", + " path: definitions.inducer\n", + " nutrient_restriction:\n", + " field: restriction\n", + " log2fc:\n", + " field: log2_shrunken_timecourses\n", + " dtype: numeric\n", + " log2_raw_ratio:\n", + " field: log2_ratio\n", + " dtype: numeric\n", + " time_point:\n", + " field: time\n", + " dtype: numeric\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: perturbation_id\n", + "\n", + " BrentLab/kemmeren_2014:\n", + " dataset:\n", + " kemmeren_2014:\n", + " sample_id:\n", + " field: sample_id\n", + " carbon_source:\n", + " path: media.carbon_source.compound\n", + " temperature_celsius:\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: perturbation_id\n", + "\n", + " BrentLab/yeast_comparative_analysis:\n", + " dataset:\n", + " dto:\n", + " binding_id:\n", + " field: binding_id\n", + " perturbation_id:\n", + " field: perturbation_id\n", + " fdr:\n", + " field: dto_fdr\n", + " dtype: numeric\n", + " pvalue:\n", + " field: dto_empirical_pvalue\n", + " dtype: numeric\n", + " binding_threshold:\n", + " field: binding_rank_threshold\n", + " dtype: numeric\n", + " perturbation_threshold:\n", + " field: perturbation_rank_threshold\n", + " dtype: numeric\n", + " binding_set_size:\n", + " field: binding_set_size\n", + " dtype: numeric\n", + " perturbation_set_size:\n", + " field: perturbation_set_size\n", + " dtype: numeric\n", + "\n", + "factor_aliases:\n", + " carbon_source:\n", + " glucose: [D-glucose, dextrose, glu]\n", + " galactose: [D-galactose, gal]\n", + " raffinose: [D-raffinose]\n", + "\n", + "missing_value_labels:\n", + " carbon_source: \"unspecified\"\n", + "\n", + "description:\n", + " carbon_source: The carbon source provided during 
growth\n", + " temperature_celsius: Growth temperature in degrees Celsius\n", + " environmental_condition: Named environmental condition\n", + "\"\"\"\n", + "\n", + "# Save the configuration to a temporary file\n", + "temp_config = Path(tempfile.mkdtemp()) / \"vdb_config.yaml\"\n", + "temp_config.write_text(config_yaml)\n", + "\n", + "print(f\"\u2705 Configuration file saved at: {temp_config}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u2705 VirtualDB initialized successfully!\n", + "Number of configured repositories: 7\n", + "\n", + "Configured datasets:\n", + " - BrentLab/harbison_2004/harbison_2004\n", + " - BrentLab/rossi_2021/rossi_2021_af_combined\n", + " - BrentLab/mahendrawada_2025/reprocessed_diffcontrol_5prime\n", + " - BrentLab/callingcards/annotated_features\n", + " - BrentLab/hackett_2020/hackett_2020\n", + " - BrentLab/kemmeren_2014/kemmeren_2014\n", + " - BrentLab/yeast_comparative_analysis/dto\n" + ] + } + ], + "source": [ + "# Initialize VirtualDB\n", + "from tfbpapi.virtual_db import VirtualDB\n", + "\n", + "# Token authentication required\n", + "hf_token = \"\"\n", + "\n", + "vdb = VirtualDB(str(temp_config), token=hf_token)\n", + "\n", + "print(\"\u2705 VirtualDB initialized successfully!\")\n", + "print(f\"Number of configured repositories: {len(vdb.config.repositories)}\")\n", + "\n", + "# List all configured datasets\n", + "print(\"\\nConfigured datasets:\")\n", + "for repo_id, repo_config in vdb.config.repositories.items():\n", + " if repo_config.dataset:\n", + " for config_name in repo_config.dataset.keys():\n", + " print(f\" - {repo_id}/{config_name}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Added the ability to perform comparative analysis to the query." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 6 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 51569.31it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idcarbon_sourcetemperature_celsiuspvalueperturbation_iddataset_id
01glucose30NaNNaNBrentLab/harbison_2004/harbison_2004
12glucose300.301BrentLab/kemmeren_2014;kemmeren_2014;18BrentLab/harbison_2004/harbison_2004
22glucose30NaNBrentLab/Hackett_2020;hackett_2020;33BrentLab/harbison_2004/harbison_2004
32glucose300.512BrentLab/Hackett_2020;hackett_2020;34BrentLab/harbison_2004/harbison_2004
42glucose300.306BrentLab/Hackett_2020;hackett_2020;40BrentLab/harbison_2004/harbison_2004
52glucose30NaNBrentLab/Hackett_2020;hackett_2020;37BrentLab/harbison_2004/harbison_2004
62glucose300.309BrentLab/Hackett_2020;hackett_2020;38BrentLab/harbison_2004/harbison_2004
72glucose300.644BrentLab/Hackett_2020;hackett_2020;36BrentLab/harbison_2004/harbison_2004
82glucose300.411BrentLab/Hackett_2020;hackett_2020;35BrentLab/harbison_2004/harbison_2004
92glucose300.536BrentLab/Hackett_2020;hackett_2020;39BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id carbon_source temperature_celsius pvalue \\\n", + "0 1 glucose 30 NaN \n", + "1 2 glucose 30 0.301 \n", + "2 2 glucose 30 NaN \n", + "3 2 glucose 30 0.512 \n", + "4 2 glucose 30 0.306 \n", + "5 2 glucose 30 NaN \n", + "6 2 glucose 30 0.309 \n", + "7 2 glucose 30 0.644 \n", + "8 2 glucose 30 0.411 \n", + "9 2 glucose 30 0.536 \n", + "\n", + " perturbation_id \\\n", + "0 NaN \n", + "1 BrentLab/kemmeren_2014;kemmeren_2014;18 \n", + "2 BrentLab/Hackett_2020;hackett_2020;33 \n", + "3 BrentLab/Hackett_2020;hackett_2020;34 \n", + "4 BrentLab/Hackett_2020;hackett_2020;40 \n", + "5 BrentLab/Hackett_2020;hackett_2020;37 \n", + "6 BrentLab/Hackett_2020;hackett_2020;38 \n", + "7 BrentLab/Hackett_2020;hackett_2020;36 \n", + "8 BrentLab/Hackett_2020;hackett_2020;35 \n", + "9 BrentLab/Hackett_2020;hackett_2020;39 \n", + "\n", + " dataset_id \n", + "0 BrentLab/harbison_2004/harbison_2004 \n", + "1 BrentLab/harbison_2004/harbison_2004 \n", + "2 BrentLab/harbison_2004/harbison_2004 \n", + "3 BrentLab/harbison_2004/harbison_2004 \n", + "4 BrentLab/harbison_2004/harbison_2004 \n", + "5 BrentLab/harbison_2004/harbison_2004 \n", + "6 BrentLab/harbison_2004/harbison_2004 \n", + "7 BrentLab/harbison_2004/harbison_2004 \n", + "8 BrentLab/harbison_2004/harbison_2004 \n", + "9 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_p001 = vdb.query(\n", + " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", + " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"pvalue\", \"perturbation_id\"],\n", + ")\n", + "\n", + "all_p001.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 6 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 49152.00it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idtemperature_celsiusperturbation_idbinding_idpvaluedataset_id
03430BrentLab/Hackett_2020;hackett_2020;34BrentLab/callingcards;annotated_features;3940.010BrentLab/hackett_2020/hackett_2020
13830BrentLab/Hackett_2020;hackett_2020;38BrentLab/callingcards;annotated_features;3800.003BrentLab/hackett_2020/hackett_2020
23930BrentLab/Hackett_2020;hackett_2020;39BrentLab/callingcards;annotated_features;3800.000BrentLab/hackett_2020/hackett_2020
33930BrentLab/Hackett_2020;hackett_2020;39BrentLab/callingcards;annotated_features;7480.000BrentLab/hackett_2020/hackett_2020
44030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;3800.000BrentLab/hackett_2020/hackett_2020
54030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;3940.000BrentLab/hackett_2020/hackett_2020
64030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;7480.000BrentLab/hackett_2020/hackett_2020
74430BrentLab/Hackett_2020;hackett_2020;44BrentLab/callingcards;annotated_features;340.000BrentLab/hackett_2020/hackett_2020
84430BrentLab/Hackett_2020;hackett_2020;44BrentLab/harbison_2004;harbison_2004;70.009BrentLab/hackett_2020/hackett_2020
94430BrentLab/Hackett_2020;hackett_2020;44BrentLab/harbison_2004;harbison_2004;80.007BrentLab/hackett_2020/hackett_2020
\n", + "
" + ], + "text/plain": [ + " sample_id temperature_celsius perturbation_id \\\n", + "0 34 30 BrentLab/Hackett_2020;hackett_2020;34 \n", + "1 38 30 BrentLab/Hackett_2020;hackett_2020;38 \n", + "2 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n", + "3 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n", + "4 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "5 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "6 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "7 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "8 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "9 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "\n", + " binding_id pvalue \\\n", + "0 BrentLab/callingcards;annotated_features;394 0.010 \n", + "1 BrentLab/callingcards;annotated_features;380 0.003 \n", + "2 BrentLab/callingcards;annotated_features;380 0.000 \n", + "3 BrentLab/callingcards;annotated_features;748 0.000 \n", + "4 BrentLab/callingcards;annotated_features;380 0.000 \n", + "5 BrentLab/callingcards;annotated_features;394 0.000 \n", + "6 BrentLab/callingcards;annotated_features;748 0.000 \n", + "7 BrentLab/callingcards;annotated_features;34 0.000 \n", + "8 BrentLab/harbison_2004;harbison_2004;7 0.009 \n", + "9 BrentLab/harbison_2004;harbison_2004;8 0.007 \n", + "\n", + " dataset_id \n", + "0 BrentLab/hackett_2020/hackett_2020 \n", + "1 BrentLab/hackett_2020/hackett_2020 \n", + "2 BrentLab/hackett_2020/hackett_2020 \n", + "3 BrentLab/hackett_2020/hackett_2020 \n", + "4 BrentLab/hackett_2020/hackett_2020 \n", + "5 BrentLab/hackett_2020/hackett_2020 \n", + "6 BrentLab/hackett_2020/hackett_2020 \n", + "7 BrentLab/hackett_2020/hackett_2020 \n", + "8 BrentLab/hackett_2020/hackett_2020 \n", + "9 BrentLab/hackett_2020/hackett_2020 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Even if not specified in the `fields` parameter, the filter options will still be retained within the `fields` parameter.\n", + "all_p001 = vdb.query(\n", + " datasets=[(\"BrentLab/hackett_2020\", \"hackett_2020\")],\n", + " filters={\n", + " \"pvalue\": (\"<=\", 0.01) \n", + " },\n", + " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"perturbation_id\",\"binding_id\"],\n", + ")\n", + "\n", + "all_p001.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a function called query_dto that is specifically responsible for retrieving the DTO data for the specified binding and perturbation datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 126 DTO records\n", + "Column names: ['binding_id', 'perturbation_id', 'pvalue', 'fdr', 'sample_id', 'carbon_source', 'temperature_celsius']\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
binding_idperturbation_idpvaluefdrsample_idcarbon_sourcetemperature_celsius
12BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;850.0040.0002253glucose30
17BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;870.0100.0002253glucose30
18BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;820.0050.0002253glucose30
50BrentLab/harbison_2004;harbison_2004;7BrentLab/Hackett_2020;hackett_2020;440.0090.0224957glucose30
59BrentLab/harbison_2004;harbison_2004;8BrentLab/Hackett_2020;hackett_2020;440.0070.0800578glucose30
61BrentLab/harbison_2004;harbison_2004;8BrentLab/Hackett_2020;hackett_2020;460.0000.0625118glucose30
66BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;440.0060.1106849glucose30
68BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;480.0040.3176109glucose30
71BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;460.0010.1176879glucose30
76BrentLab/harbison_2004;harbison_2004;10BrentLab/Hackett_2020;hackett_2020;460.0030.09927210unspecified30
\n", + "
" + ], + "text/plain": [ + " binding_id \\\n", + "12 BrentLab/harbison_2004;harbison_2004;3 \n", + "17 BrentLab/harbison_2004;harbison_2004;3 \n", + "18 BrentLab/harbison_2004;harbison_2004;3 \n", + "50 BrentLab/harbison_2004;harbison_2004;7 \n", + "59 BrentLab/harbison_2004;harbison_2004;8 \n", + "61 BrentLab/harbison_2004;harbison_2004;8 \n", + "66 BrentLab/harbison_2004;harbison_2004;9 \n", + "68 BrentLab/harbison_2004;harbison_2004;9 \n", + "71 BrentLab/harbison_2004;harbison_2004;9 \n", + "76 BrentLab/harbison_2004;harbison_2004;10 \n", + "\n", + " perturbation_id pvalue fdr sample_id \\\n", + "12 BrentLab/Hackett_2020;hackett_2020;85 0.004 0.000225 3 \n", + "17 BrentLab/Hackett_2020;hackett_2020;87 0.010 0.000225 3 \n", + "18 BrentLab/Hackett_2020;hackett_2020;82 0.005 0.000225 3 \n", + "50 BrentLab/Hackett_2020;hackett_2020;44 0.009 0.022495 7 \n", + "59 BrentLab/Hackett_2020;hackett_2020;44 0.007 0.080057 8 \n", + "61 BrentLab/Hackett_2020;hackett_2020;46 0.000 0.062511 8 \n", + "66 BrentLab/Hackett_2020;hackett_2020;44 0.006 0.110684 9 \n", + "68 BrentLab/Hackett_2020;hackett_2020;48 0.004 0.317610 9 \n", + "71 BrentLab/Hackett_2020;hackett_2020;46 0.001 0.117687 9 \n", + "76 BrentLab/Hackett_2020;hackett_2020;46 0.003 0.099272 10 \n", + "\n", + " carbon_source temperature_celsius \n", + "12 glucose 30 \n", + "17 glucose 30 \n", + "18 glucose 30 \n", + "50 glucose 30 \n", + "59 glucose 30 \n", + "61 glucose 30 \n", + "66 glucose 30 \n", + "68 glucose 30 \n", + "71 glucose 30 \n", + "76 unspecified 30 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example: Query the intersection of harbison and hackett, filter for pvalue <= 0.01\n", + "dto_result = vdb.query_dto(\n", + " binding_dataset=(\"BrentLab/harbison_2004\", \"harbison_2004\"),\n", + " perturbation_dataset=(\"BrentLab/hackett_2020\", \"hackett_2020\"),\n", + " dto_filters={\"pvalue\": (\"<=\", 0.01)},\n", + " fields=[\"binding_id\", \"perturbation_id\", \"pvalue\", \"fdr\",\"sample_id\", \"carbon_source\", \"temperature_celsius\"]\n", + ")\n", + "\n", + "print(f\"Found {len(dto_result)} DTO records\")\n", + "print(f\"Column names: {list(dto_result.columns)}\")\n", + "dto_result.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetching 135 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 135/135 [00:00<00:00, 18401.45it/s]\n", + "Query execution failed: Binder Error: Referenced column \"sample_id\" not found in FROM clause!\n", + "Candidate bindings: \"callingcards_enrichment\", \"target_symbol\"\n", + "\n", + "LINE 1: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n", + " ^\n", + "SQL: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n" + ] + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [] + } + ], + "source": [ + "all_p001 = vdb.query( \n", + " datasets=[(\"BrentLab/callingcards\", \"annotated_features\")],\n", + " complete=False\n", + ")\n", + "all_p001.head()\n", + "print(f\"\u603b\u5171\u6709 {len(all_p001)} \u884c\u6570\u636e\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "query in binding or pert,not compara, use vdb function" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 
3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/tutorials/show_new_functions.ipynb b/docs/tutorials/show_new_functions.ipynb new file mode 100644 index 0000000..efd8da3 --- /dev/null +++ b/docs/tutorials/show_new_functions.ipynb @@ -0,0 +1,960 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b5f1facc", + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from collections import Counter\n", + "\n", + "# Set display options\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_rows', 100)\n", + "pd.set_option('display.width', None)\n", + "\n", + "# Set plot style\n", + "plt.style.use('seaborn-v0_8-whitegrid')\n", + "sns.set_palette('husl')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5452d8e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Configuration file saved at: /tmp/tmp9lavjul7/vdb_config.yaml\n" + ] + } + ], + "source": [ + "# Create VirtualDB configuration\n", + "# This configuration defines how to map the fields of different datasets and how to associate DTO comparative analysis data\n", + "\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "config_yaml = \"\"\"\n", + "repositories:\n", + "  BrentLab/harbison_2004:\n", + "    dataset:\n", + "      harbison_2004:\n", + "        sample_id:\n", + "          field: sample_id\n", + "        carbon_source:\n", + "          field: condition\n", + "          path: media.carbon_source.compound\n", + "        temperature_celsius:\n", + "          field: condition\n", + "          path: temperature_celsius\n", + "          dtype: numeric\n", + "        environmental_condition:\n", + "          field: condition\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        regulator_symbol:\n", + "          field: regulator_symbol\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            dataset: dto\n", + "            via_field: binding_id\n", + "\n", + "  BrentLab/rossi_2021:\n", + "    carbon_source: \n", + "      path: media.carbon_source.compound\n", + "    temperature_celsius: \n", + "      path: temperature_celsius\n", + "    dataset:\n", + "      rossi_2021_af_combined:\n", + "        sample_id: \n", + "          field: sample_id\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        target_locus_tag:\n", + "          field: target_locus_tag\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            dataset: dto\n", + "            via_field: binding_id\n", + "\n", + "  BrentLab/mahendrawada_2025:\n", + "    dataset:\n", + "      reprocessed_diffcontrol_5prime:\n", + "        sample_id:\n", + "          field: sample_id\n", + "        control_source:\n", + "          field: control_source\n", + "        regulator_locus_tag:\n", + "          field: regulator_locus_tag\n", + "        regulator_symbol:\n", + "          field: regulator_symbol\n", + "        environmental_condition:\n", + "          field: condition\n", + "        temperature_celsius:\n", + "          field: condition\n", + "          path: temperature_celsius\n", + "          dtype: numeric\n", + "        media_name:\n", + "          field: condition\n", + "          path: media.name\n", + "        carbon_source:\n", + "          field: condition\n", + "          path: media.carbon_source\n", + "\n", + "        comparative_analyses:\n", + "          - repo: BrentLab/yeast_comparative_analysis\n", + "            
dataset: dto\n", + " via_field: binding_id\n", + "\n", + "\n", + " BrentLab/callingcards:\n", + " carbon_source: \n", + " path: media.carbon_source.compound\n", + " temperature_celsius: \n", + " path: temperature_celsius\n", + " dataset:\n", + " annotated_features:\n", + " id:\n", + " field: id\n", + " regulator_locus_tag:\n", + " field: target_locus_tag\n", + " regulator_symbol:\n", + " field: target_symbol\n", + " \n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: binding_id\n", + " \n", + " BrentLab/hackett_2020:\n", + " dataset:\n", + " hackett_2020:\n", + " sample_id:\n", + " field: sample_id\n", + " dtype: numeric\n", + " regulator_locus_tag:\n", + " field: regulator_locus_tag\n", + " temperature_celsius:\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + " cultivation_method:\n", + " path: cultivation_method\n", + " media_name:\n", + " path: media.name\n", + " induction_system:\n", + " field: mechanism\n", + " inducer_compound:\n", + " field: mechanism\n", + " path: definitions.inducer\n", + " nutrient_restriction:\n", + " field: restriction\n", + " log2fc:\n", + " field: log2_shrunken_timecourses\n", + " dtype: numeric\n", + " log2_raw_ratio:\n", + " field: log2_ratio\n", + " dtype: numeric\n", + " time_point:\n", + " field: time\n", + " dtype: numeric\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: perturbation_id\n", + "\n", + " BrentLab/kemmeren_2014:\n", + " dataset:\n", + " kemmeren_2014:\n", + " sample_id:\n", + " field: sample_id\n", + " carbon_source:\n", + " path: media.carbon_source.compound\n", + " temperature_celsius:\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + "\n", + " comparative_analyses:\n", + " - repo: BrentLab/yeast_comparative_analysis\n", + " dataset: dto\n", + " via_field: perturbation_id\n", + "\n", + " BrentLab/yeast_comparative_analysis:\n", + " dataset:\n", + " dto:\n", + " binding_id:\n", + " field: binding_id\n", + " perturbation_id:\n", + " field: perturbation_id\n", + " fdr:\n", + " field: dto_fdr\n", + " dtype: numeric\n", + " pvalue:\n", + " field: dto_empirical_pvalue\n", + " dtype: numeric\n", + " binding_threshold:\n", + " field: binding_rank_threshold\n", + " dtype: numeric\n", + " perturbation_threshold:\n", + " field: perturbation_rank_threshold\n", + " dtype: numeric\n", + " binding_set_size:\n", + " field: binding_set_size\n", + " dtype: numeric\n", + " perturbation_set_size:\n", + " field: perturbation_set_size\n", + " dtype: numeric\n", + "\n", + "factor_aliases:\n", + " carbon_source:\n", + " glucose: [D-glucose, dextrose, glu]\n", + " galactose: [D-galactose, gal]\n", + " raffinose: [D-raffinose]\n", + "\n", + "missing_value_labels:\n", + " carbon_source: \"unspecified\"\n", + "\n", + "description:\n", + " carbon_source: The carbon source provided during growth\n", + " temperature_celsius: Growth temperature in degrees Celsius\n", + " environmental_condition: Named environmental condition\n", + "\"\"\"\n", + "\n", + "# Save the configuration to a temporary file\n", + "temp_config = Path(tempfile.mkdtemp()) / \"vdb_config.yaml\"\n", + "temp_config.write_text(config_yaml)\n", + "\n", + "print(f\"✅ Configuration file saved at: {temp_config}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1550d737", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ VirtualDB initialized 
successfully!\n", + "Number of configured repositories: 7\n", + "\n", + "Configured datasets:\n", + "  - BrentLab/harbison_2004/harbison_2004\n", + "  - BrentLab/rossi_2021/rossi_2021_af_combined\n", + "  - BrentLab/mahendrawada_2025/reprocessed_diffcontrol_5prime\n", + "  - BrentLab/callingcards/annotated_features\n", + "  - BrentLab/hackett_2020/hackett_2020\n", + "  - BrentLab/kemmeren_2014/kemmeren_2014\n", + "  - BrentLab/yeast_comparative_analysis/dto\n" ] } ], "source": [ "# Initialize VirtualDB\n", "import os\n", "from tfbpapi.virtual_db import VirtualDB\n", "\n", "# Token authentication required\n", "hf_token = os.getenv(\"HF_TOKEN\", None)\n", "\n", "vdb = VirtualDB(str(temp_config), token=hf_token)\n", "\n", "print(\"✅ VirtualDB initialized successfully!\")\n", "print(f\"Number of configured repositories: {len(vdb.config.repositories)}\")\n", "\n", "# List all configured datasets\n", "print(\"\\nConfigured datasets:\")\n", "for repo_id, repo_config in vdb.config.repositories.items():\n", "    if repo_config.dataset:\n", "        for config_name in repo_config.dataset.keys():\n", "            print(f\"  - {repo_id}/{config_name}\")\n" ] }, { "cell_type": "markdown", "id": "e6dc4ce8", "metadata": {}, "source": [ "`query` can now include comparative-analysis fields (such as the DTO `pvalue`) directly in its results." ] }, { "cell_type": "code", "execution_count": 5, "id": "c9b1b241", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 41665.27it/s]\n" ] }, { "data": { "text/html": [ "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idcarbon_sourcetemperature_celsiuspvalueperturbation_iddataset_id
01glucose30NaNNaNBrentLab/harbison_2004/harbison_2004
12glucose300.301BrentLab/kemmeren_2014;kemmeren_2014;18BrentLab/harbison_2004/harbison_2004
22glucose30NaNBrentLab/Hackett_2020;hackett_2020;33BrentLab/harbison_2004/harbison_2004
32glucose300.512BrentLab/Hackett_2020;hackett_2020;34BrentLab/harbison_2004/harbison_2004
42glucose300.306BrentLab/Hackett_2020;hackett_2020;40BrentLab/harbison_2004/harbison_2004
52glucose30NaNBrentLab/Hackett_2020;hackett_2020;37BrentLab/harbison_2004/harbison_2004
62glucose300.309BrentLab/Hackett_2020;hackett_2020;38BrentLab/harbison_2004/harbison_2004
72glucose300.644BrentLab/Hackett_2020;hackett_2020;36BrentLab/harbison_2004/harbison_2004
82glucose300.411BrentLab/Hackett_2020;hackett_2020;35BrentLab/harbison_2004/harbison_2004
92glucose300.536BrentLab/Hackett_2020;hackett_2020;39BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id carbon_source temperature_celsius pvalue \\\n", + "0 1 glucose 30 NaN \n", + "1 2 glucose 30 0.301 \n", + "2 2 glucose 30 NaN \n", + "3 2 glucose 30 0.512 \n", + "4 2 glucose 30 0.306 \n", + "5 2 glucose 30 NaN \n", + "6 2 glucose 30 0.309 \n", + "7 2 glucose 30 0.644 \n", + "8 2 glucose 30 0.411 \n", + "9 2 glucose 30 0.536 \n", + "\n", + " perturbation_id \\\n", + "0 NaN \n", + "1 BrentLab/kemmeren_2014;kemmeren_2014;18 \n", + "2 BrentLab/Hackett_2020;hackett_2020;33 \n", + "3 BrentLab/Hackett_2020;hackett_2020;34 \n", + "4 BrentLab/Hackett_2020;hackett_2020;40 \n", + "5 BrentLab/Hackett_2020;hackett_2020;37 \n", + "6 BrentLab/Hackett_2020;hackett_2020;38 \n", + "7 BrentLab/Hackett_2020;hackett_2020;36 \n", + "8 BrentLab/Hackett_2020;hackett_2020;35 \n", + "9 BrentLab/Hackett_2020;hackett_2020;39 \n", + "\n", + " dataset_id \n", + "0 BrentLab/harbison_2004/harbison_2004 \n", + "1 BrentLab/harbison_2004/harbison_2004 \n", + "2 BrentLab/harbison_2004/harbison_2004 \n", + "3 BrentLab/harbison_2004/harbison_2004 \n", + "4 BrentLab/harbison_2004/harbison_2004 \n", + "5 BrentLab/harbison_2004/harbison_2004 \n", + "6 BrentLab/harbison_2004/harbison_2004 \n", + "7 BrentLab/harbison_2004/harbison_2004 \n", + "8 BrentLab/harbison_2004/harbison_2004 \n", + "9 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_p001 = vdb.query(\n", + " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", + " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"pvalue\", \"perturbation_id\"],\n", + ")\n", + "\n", + "all_p001.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f7f6a7f4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 37729.87it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idtemperature_celsiusperturbation_idbinding_idpvaluedataset_id
03430BrentLab/Hackett_2020;hackett_2020;34BrentLab/callingcards;annotated_features;3940.010BrentLab/hackett_2020/hackett_2020
13830BrentLab/Hackett_2020;hackett_2020;38BrentLab/callingcards;annotated_features;3800.003BrentLab/hackett_2020/hackett_2020
23930BrentLab/Hackett_2020;hackett_2020;39BrentLab/callingcards;annotated_features;3800.000BrentLab/hackett_2020/hackett_2020
33930BrentLab/Hackett_2020;hackett_2020;39BrentLab/callingcards;annotated_features;7480.000BrentLab/hackett_2020/hackett_2020
44030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;3800.000BrentLab/hackett_2020/hackett_2020
54030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;3940.000BrentLab/hackett_2020/hackett_2020
64030BrentLab/Hackett_2020;hackett_2020;40BrentLab/callingcards;annotated_features;7480.000BrentLab/hackett_2020/hackett_2020
74430BrentLab/Hackett_2020;hackett_2020;44BrentLab/callingcards;annotated_features;340.000BrentLab/hackett_2020/hackett_2020
84430BrentLab/Hackett_2020;hackett_2020;44BrentLab/harbison_2004;harbison_2004;70.009BrentLab/hackett_2020/hackett_2020
94430BrentLab/Hackett_2020;hackett_2020;44BrentLab/harbison_2004;harbison_2004;80.007BrentLab/hackett_2020/hackett_2020
\n", + "
" + ], + "text/plain": [ + " sample_id temperature_celsius perturbation_id \\\n", + "0 34 30 BrentLab/Hackett_2020;hackett_2020;34 \n", + "1 38 30 BrentLab/Hackett_2020;hackett_2020;38 \n", + "2 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n", + "3 39 30 BrentLab/Hackett_2020;hackett_2020;39 \n", + "4 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "5 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "6 40 30 BrentLab/Hackett_2020;hackett_2020;40 \n", + "7 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "8 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "9 44 30 BrentLab/Hackett_2020;hackett_2020;44 \n", + "\n", + " binding_id pvalue \\\n", + "0 BrentLab/callingcards;annotated_features;394 0.010 \n", + "1 BrentLab/callingcards;annotated_features;380 0.003 \n", + "2 BrentLab/callingcards;annotated_features;380 0.000 \n", + "3 BrentLab/callingcards;annotated_features;748 0.000 \n", + "4 BrentLab/callingcards;annotated_features;380 0.000 \n", + "5 BrentLab/callingcards;annotated_features;394 0.000 \n", + "6 BrentLab/callingcards;annotated_features;748 0.000 \n", + "7 BrentLab/callingcards;annotated_features;34 0.000 \n", + "8 BrentLab/harbison_2004;harbison_2004;7 0.009 \n", + "9 BrentLab/harbison_2004;harbison_2004;8 0.007 \n", + "\n", + " dataset_id \n", + "0 BrentLab/hackett_2020/hackett_2020 \n", + "1 BrentLab/hackett_2020/hackett_2020 \n", + "2 BrentLab/hackett_2020/hackett_2020 \n", + "3 BrentLab/hackett_2020/hackett_2020 \n", + "4 BrentLab/hackett_2020/hackett_2020 \n", + "5 BrentLab/hackett_2020/hackett_2020 \n", + "6 BrentLab/hackett_2020/hackett_2020 \n", + "7 BrentLab/hackett_2020/hackett_2020 \n", + "8 BrentLab/hackett_2020/hackett_2020 \n", + "9 BrentLab/hackett_2020/hackett_2020 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Even if not specified in the `fields` parameter, the filter options will still be retained within the `fields` parameter.\n", + "all_p001 = vdb.query(\n", + " datasets=[(\"BrentLab/hackett_2020\", \"hackett_2020\")],\n", + " filters={\n", + " \"pvalue\": (\"<=\", 0.01) \n", + " },\n", + " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"perturbation_id\",\"binding_id\"],\n", + ")\n", + "\n", + "all_p001.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "5bd97850", + "metadata": {}, + "source": [ + "Create a function called query_dto that is specifically responsible for retrieving the DTO data for the specified binding and perturbation datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "20864108", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 126 DTO records\n", + "Column names: ['binding_id', 'perturbation_id', 'pvalue', 'fdr', 'sample_id', 'carbon_source', 'temperature_celsius']\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
binding_idperturbation_idpvaluefdrsample_idcarbon_sourcetemperature_celsius
12BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;850.0040.0002253glucose30
17BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;870.0100.0002253glucose30
18BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;820.0050.0002253glucose30
50BrentLab/harbison_2004;harbison_2004;7BrentLab/Hackett_2020;hackett_2020;440.0090.0224957glucose30
59BrentLab/harbison_2004;harbison_2004;8BrentLab/Hackett_2020;hackett_2020;440.0070.0800578glucose30
61BrentLab/harbison_2004;harbison_2004;8BrentLab/Hackett_2020;hackett_2020;460.0000.0625118glucose30
66BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;440.0060.1106849glucose30
68BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;480.0040.3176109glucose30
71BrentLab/harbison_2004;harbison_2004;9BrentLab/Hackett_2020;hackett_2020;460.0010.1176879glucose30
76BrentLab/harbison_2004;harbison_2004;10BrentLab/Hackett_2020;hackett_2020;460.0030.09927210unspecified30
\n", + "
" + ], + "text/plain": [ + " binding_id \\\n", + "12 BrentLab/harbison_2004;harbison_2004;3 \n", + "17 BrentLab/harbison_2004;harbison_2004;3 \n", + "18 BrentLab/harbison_2004;harbison_2004;3 \n", + "50 BrentLab/harbison_2004;harbison_2004;7 \n", + "59 BrentLab/harbison_2004;harbison_2004;8 \n", + "61 BrentLab/harbison_2004;harbison_2004;8 \n", + "66 BrentLab/harbison_2004;harbison_2004;9 \n", + "68 BrentLab/harbison_2004;harbison_2004;9 \n", + "71 BrentLab/harbison_2004;harbison_2004;9 \n", + "76 BrentLab/harbison_2004;harbison_2004;10 \n", + "\n", + " perturbation_id pvalue fdr sample_id \\\n", + "12 BrentLab/Hackett_2020;hackett_2020;85 0.004 0.000225 3 \n", + "17 BrentLab/Hackett_2020;hackett_2020;87 0.010 0.000225 3 \n", + "18 BrentLab/Hackett_2020;hackett_2020;82 0.005 0.000225 3 \n", + "50 BrentLab/Hackett_2020;hackett_2020;44 0.009 0.022495 7 \n", + "59 BrentLab/Hackett_2020;hackett_2020;44 0.007 0.080057 8 \n", + "61 BrentLab/Hackett_2020;hackett_2020;46 0.000 0.062511 8 \n", + "66 BrentLab/Hackett_2020;hackett_2020;44 0.006 0.110684 9 \n", + "68 BrentLab/Hackett_2020;hackett_2020;48 0.004 0.317610 9 \n", + "71 BrentLab/Hackett_2020;hackett_2020;46 0.001 0.117687 9 \n", + "76 BrentLab/Hackett_2020;hackett_2020;46 0.003 0.099272 10 \n", + "\n", + " carbon_source temperature_celsius \n", + "12 glucose 30 \n", + "17 glucose 30 \n", + "18 glucose 30 \n", + "50 glucose 30 \n", + "59 glucose 30 \n", + "61 glucose 30 \n", + "66 glucose 30 \n", + "68 glucose 30 \n", + "71 glucose 30 \n", + "76 unspecified 30 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example: Query the intersection of harbison and hackett, filter for pvalue <= 0.01\n", + "dto_result = vdb.query_dto(\n", + " binding_dataset=(\"BrentLab/harbison_2004\", \"harbison_2004\"),\n", + " perturbation_dataset=(\"BrentLab/hackett_2020\", \"hackett_2020\"),\n", + " dto_filters={\"pvalue\": (\"<=\", 0.01)},\n", + " fields=[\"binding_id\", \"perturbation_id\", \"pvalue\", \"fdr\",\"sample_id\", \"carbon_source\", \"temperature_celsius\"]\n", + ")\n", + "\n", + "print(f\"Found {len(dto_result)} DTO records\")\n", + "print(f\"Column names: {list(dto_result.columns)}\")\n", + "dto_result.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "15f63f8a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Query execution failed: Binder Error: Referenced column \"sample_id\" not found in FROM clause!\n", + "Candidate bindings: \"callingcards_enrichment\", \"target_symbol\"\n", + "\n", + "LINE 1: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n", + " ^\n", + "SQL: SELECT sample_id FROM metadata_annotated_features LIMIT 1\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m all_p001 = \u001b[43mvdb\u001b[49m\u001b[43m.\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\n\u001b[32m 2\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mdatasets\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mBrentLab/callingcards\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mannotated_features\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mcomplete\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[32m 4\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 5\u001b[39m all_p001.head()\n\u001b[32m 6\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mTotal number of rows: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(all_p001)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/tfbpapi/virtual_db.py:406\u001b[39m, in \u001b[36mVirtualDB.query\u001b[39m\u001b[34m(self, filters, datasets, fields, complete)\u001b[39m\n\u001b[32m 403\u001b[39m results: \u001b[38;5;28mlist\u001b[39m[pd.DataFrame] = []\n\u001b[32m 404\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m repo_id, config_name \u001b[38;5;129;01min\u001b[39;00m datasets:\n\u001b[32m 405\u001b[39m \u001b[38;5;66;03m# Build metadata table\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m406\u001b[39m metadata_df = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_build_metadata_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 407\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m metadata_df.empty:\n\u001b[32m 408\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/tfbpapi/virtual_db.py:690\u001b[39m, in \u001b[36mVirtualDB._build_metadata_table\u001b[39m\u001b[34m(self, repo_id, config_name, use_cache)\u001b[39m\n\u001b[32m 688\u001b[39m \u001b[38;5;66;03m# If sample_id doesn't exist, generate from row number\u001b[39;00m\n\u001b[32m 689\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33msample_id\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m df.columns \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m df.empty:\n\u001b[32m--> \u001b[39m\u001b[32m690\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33msample_id\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m.\u001b[49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 692\u001b[39m \u001b[38;5;66;03m# One row per sample_id\u001b[39;00m\n\u001b[32m 693\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33msample_id\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m df.columns:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/.venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:1104\u001b[39m, in \u001b[36mIndex.astype\u001b[39m\u001b[34m(self, dtype, copy)\u001b[39m\n\u001b[32m 1100\u001b[39m new_values = \u001b[38;5;28mcls\u001b[39m._from_sequence(\u001b[38;5;28mself\u001b[39m, dtype=dtype, copy=copy)\n\u001b[32m 1102\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1103\u001b[39m \u001b[38;5;66;03m# GH#13149 specifically use astype_array instead of 
astype\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1104\u001b[39m new_values = \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1106\u001b[39m \u001b[38;5;66;03m# pass copy=False because any copying will be done in the astype above\u001b[39;00m\n\u001b[32m 1107\u001b[39m result = Index(new_values, name=\u001b[38;5;28mself\u001b[39m.name, dtype=new_values.dtype, copy=\u001b[38;5;28;01mFalse\u001b[39;00m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:182\u001b[39m, in \u001b[36mastype_array\u001b[39m\u001b[34m(values, dtype, copy)\u001b[39m\n\u001b[32m 179\u001b[39m values = values.astype(dtype, copy=copy)\n\u001b[32m 181\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m182\u001b[39m values = \u001b[43m_astype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 184\u001b[39m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[32m 185\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np.dtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values.dtype.type, \u001b[38;5;28mstr\u001b[39m):\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/tfbpapi/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:96\u001b[39m, in \u001b[36m_astype_nansafe\u001b[39m\u001b[34m(arr, dtype, copy, skipna)\u001b[39m\n\u001b[32m 94\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m arr.ndim > \u001b[32m1\u001b[39m:\n\u001b[32m 95\u001b[39m arr = arr.ravel()\n\u001b[32m---> \u001b[39m\u001b[32m96\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[43m.\u001b[49m\u001b[43mensure_string_array\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 97\u001b[39m \u001b[43m \u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m=\u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert_na_value\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[32m 98\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m.reshape(shape)\n\u001b[32m 100\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m np.issubdtype(arr.dtype, np.floating) \u001b[38;5;129;01mand\u001b[39;00m dtype.kind \u001b[38;5;129;01min\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33miu\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m _astype_float_to_int_nansafe(arr, dtype, copy)\n", + "\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/lib.pyx:718\u001b[39m, in \u001b[36mpandas._libs.lib.ensure_string_array\u001b[39m\u001b[34m()\u001b[39m\n", + "\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/lib.pyx:832\u001b[39m, in \u001b[36mpandas._libs.lib.ensure_string_array\u001b[39m\u001b[34m()\u001b[39m\n", + "\u001b[31mKeyboardInterrupt\u001b[39m: " + ] + } + ], + "source": [ + "all_p001 = vdb.query( \n", + " datasets=[(\"BrentLab/callingcards\", \"annotated_features\")],\n", 
+ " complete=False\n", + ")\n", + "all_p001.head()\n", + "print(f\"Total number of rows: {len(all_p001)}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py index 1293bf9..f0abf1c 100644 --- a/tfbpapi/tests/test_virtual_db.py +++ b/tfbpapi/tests/test_virtual_db.py @@ -509,185 +509,6 @@ def test_parse_composite_identifier_invalid(self): with pytest.raises(ValueError, match="Invalid composite ID format"): VirtualDB._parse_composite_identifier("invalid:format") - def test_get_comparative_fields_for_dataset(self): - """Test getting comparative fields mapping.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary": { - "dataset": { - "primary_data": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comparative", - "dataset": "comp_data", - "via_field": "binding_id", - } - ], - } - } - }, - "BrentLab/comparative": { - "dataset": { - "comp_data": { - "dto_fdr": {"field": "dto_fdr"}, - "dto_pvalue": {"field": "dto_empirical_pvalue"}, - } - } - }, - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - field_mapping = vdb._get_comparative_fields_for_dataset( - "BrentLab/primary", "primary_data" - ) - - # Should have dto_fdr and dto_pvalue, but NOT binding_id (via_field) - assert "dto_fdr" in field_mapping - assert "dto_pvalue" in field_mapping - assert "binding_id" not in field_mapping - - # Check mapping structure - assert field_mapping["dto_fdr"]["comp_repo"] == "BrentLab/comparative" - assert field_mapping["dto_fdr"]["comp_dataset"] == "comp_data" - assert field_mapping["dto_fdr"]["via_field"] == "binding_id" - finally: - Path(config_path).unlink() - - def test_get_comparative_fields_no_links(self): - """Test that datasets without comparative links return empty mapping.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary": { - "dataset": { - "primary_data": {"sample_id": {"field": "sample_id"}} - } - } - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - field_mapping = vdb._get_comparative_fields_for_dataset( - "BrentLab/primary", "primary_data" - ) - assert field_mapping == {} - finally: - Path(config_path).unlink() - - def test_get_comparative_analyses(self): - """Test getting comparative analysis relationships.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary": { - "dataset": { - "primary_data": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comparative", - "dataset": "comp_data", - "via_field": "binding_id", - } - ], - } - } - }, - "BrentLab/comparative": { - "dataset": {"comp_data": {"dto_fdr": {"field": "dto_fdr"}}} - }, - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - info = vdb.get_comparative_analyses() - - # Check primary to comparative mapping - assert 
"BrentLab/primary/primary_data" in info["primary_to_comparative"] - links = info["primary_to_comparative"]["BrentLab/primary/primary_data"] - assert len(links) == 1 - assert links[0]["comparative_repo"] == "BrentLab/comparative" - assert links[0]["comparative_dataset"] == "comp_data" - assert links[0]["via_field"] == "binding_id" - - # Check comparative fields - assert "BrentLab/comparative/comp_data" in info["comparative_fields"] - assert ( - "dto_fdr" - in info["comparative_fields"]["BrentLab/comparative/comp_data"] - ) - finally: - Path(config_path).unlink() - - def test_get_comparative_analyses_filtered(self): - """Test filtering comparative analyses by repo and config.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary1": { - "dataset": { - "data1": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comp", - "dataset": "comp_data", - "via_field": "id1", - } - ], - } - } - }, - "BrentLab/primary2": { - "dataset": { - "data2": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comp", - "dataset": "comp_data", - "via_field": "id2", - } - ], - } - } - }, - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - - # Get all - all_info = vdb.get_comparative_analyses() - assert len(all_info["primary_to_comparative"]) == 2 - - # Filter by repo and config - filtered = vdb.get_comparative_analyses("BrentLab/primary1", "data1") - assert len(filtered["primary_to_comparative"]) == 1 - assert "BrentLab/primary1/data1" in filtered["primary_to_comparative"] - - # Filter by repo only - repo_filtered = vdb.get_comparative_analyses("BrentLab/primary2") - assert len(repo_filtered["primary_to_comparative"]) == 1 - assert "BrentLab/primary2/data2" in repo_filtered["primary_to_comparative"] - finally: - Path(config_path).unlink() - # Note: Full integration tests with real HuggingFace datasets would go here # but are excluded as they require network access and specific test datasets. diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py index f6dd12e..80992b5 100644 --- a/tfbpapi/virtual_db.py +++ b/tfbpapi/virtual_db.py @@ -64,9 +64,15 @@ def get_nested_value(data: dict, path: str) -> Any: List of dicts - extract property from each item: get_nested_value( - {"media": {"carbon_source": [{"compound": "glucose"}, - {"compound": "galactose"}]}}, - "media.carbon_source.compound" + { + "media": { + "carbon_source": [ + {"compound": "glucose"}, + {"compound": "galactose"}, + ] + } + }, + "media.carbon_source.compound", ) Returns: ["glucose", "galactose"] @@ -193,8 +199,6 @@ def __init__(self, config_path: Path | str, token: str | None = None): self.config = MetadataConfig.from_yaml(config_path) self.token = token self.cache: dict[tuple[str, str], pd.DataFrame] = {} - # Build mapping of comparative dataset references - self._comparative_links = self._build_comparative_links() def get_fields( self, repo_id: str | None = None, config_name: str | None = None @@ -202,6 +206,8 @@ def get_fields( """ Get list of queryable fields. + Includes fields from comparative analyses if configured. 
+ :param repo_id: Optional repository ID to filter to specific dataset :param config_name: Optional config name (required if repo_id provided) :return: List of field names @@ -217,7 +223,23 @@ def get_fields( if repo_id is not None and config_name is not None: # Get fields for specific dataset mappings = self.config.get_property_mappings(repo_id, config_name) - return sorted(mappings.keys()) + fields = set(mappings.keys()) + + # Add fields from comparative analyses + repo_config = self.config.get_repository_config(repo_id) + if repo_config and repo_config.dataset: + dataset_config = repo_config.dataset.get(config_name) + if dataset_config and dataset_config.comparative_analyses: + for comp_analysis in dataset_config.comparative_analyses: + comp_mappings = self.config.get_property_mappings( + comp_analysis.repo, comp_analysis.dataset + ) + # Add comparative fields (exclude via_field) + for field in comp_mappings.keys(): + if field != comp_analysis.via_field: + fields.add(field) + + return sorted(fields) if repo_id is not None or config_name is not None: raise ValueError( @@ -231,16 +253,21 @@ def get_fields( all_fields.update(repo_config.properties.keys()) # Add dataset-specific fields if repo_config.dataset: - for dataset_config in repo_config.dataset.values(): - # DatasetVirtualDBConfig stores property mappings in model_extra - if ( - hasattr(dataset_config, "model_extra") - and dataset_config.model_extra - ): - all_fields.update(dataset_config.model_extra.keys()) - # Also include special fields if they exist - if dataset_config.sample_id: - all_fields.add("sample_id") + for config_name, dataset_config in repo_config.dataset.items(): + # Get property mappings (excludes comparative_analyses) + mappings = self.config.get_property_mappings(repo_id, config_name) + all_fields.update(mappings.keys()) + + # Add fields from comparative analyses + if dataset_config.comparative_analyses: + for comp_analysis in dataset_config.comparative_analyses: + comp_mappings = self.config.get_property_mappings( + comp_analysis.repo, comp_analysis.dataset + ) + # Add comparative fields (exclude via_field) + for field in comp_mappings.keys(): + if field != comp_analysis.via_field: + all_fields.add(field) return sorted(all_fields) @@ -326,101 +353,6 @@ def get_unique_values( else: return sorted(all_values) - def get_comparative_analyses( - self, repo_id: str | None = None, config_name: str | None = None - ) -> dict[str, Any]: - """ - Get information about comparative analysis relationships. - - Returns information about which comparative datasets are available - and how they link to primary datasets. Useful for discovering - what cross-dataset analyses can be performed. 
- - :param repo_id: Optional repository ID to filter to specific repo - :param config_name: Optional config name (requires repo_id) - :return: Dictionary with two keys: - - "primary_to_comparative": Maps primary datasets to their - comparative analyses - - "comparative_fields": Maps comparative datasets to fields - available for joining - :raises ValueError: If config_name provided without repo_id - - Examples: - Get all comparative analysis relationships: - info = vdb.get_comparative_analyses() - - Get relationships for specific primary dataset: - info = vdb.get_comparative_analyses( - "BrentLab/callingcards", "annotated_features" - ) - - """ - if config_name and not repo_id: - raise ValueError("repo_id required when config_name is specified") - - primary_to_comparative: dict[str, list[dict[str, str]]] = {} - comparative_fields: dict[str, list[str]] = {} - - # Filter links based on parameters - if repo_id and config_name: - # Specific dataset requested - links_to_process = { - (repo_id, config_name): self._comparative_links.get( - (repo_id, config_name), {} - ) - } - elif repo_id: - # All configs in specific repo - links_to_process = { - k: v for k, v in self._comparative_links.items() if k[0] == repo_id - } - else: - # All links - links_to_process = self._comparative_links - - # Build primary to comparative mapping - for (prim_repo, prim_config), link_info in links_to_process.items(): - if "comparative_analyses" not in link_info: - continue - - dataset_key = f"{prim_repo}/{prim_config}" - primary_to_comparative[dataset_key] = [] - - for ca in link_info["comparative_analyses"]: - primary_to_comparative[dataset_key].append( - { - "comparative_repo": ca["repo"], - "comparative_dataset": ca["dataset"], - "via_field": ca["via_field"], - } - ) - - # Track which fields are available from comparative datasets - comp_key = f"{ca['repo']}/{ca['dataset']}" - if comp_key not in comparative_fields: - # Get fields from the comparative dataset - # First try config mappings - comp_fields = self.get_fields(ca["repo"], ca["dataset"]) - - # If no mappings, get actual fields from DataCard - if not comp_fields: - try: - card = DataCard(ca["repo"], token=self.token) - config = card.get_config(ca["dataset"]) - if config and config.dataset_info: - comp_fields = [ - f.name for f in config.dataset_info.features - ] - except Exception: - comp_fields = [] - - comparative_fields[comp_key] = comp_fields - - return { - "primary_to_comparative": primary_to_comparative, - "comparative_fields": comparative_fields, - } - def query( self, filters: dict[str, Any] | None = None, @@ -475,59 +407,13 @@ def query( if metadata_df.empty: continue - # Separate filters into primary and comparative - primary_filters = {} - comparative_filters = {} + # Apply filters if filters: - # Get comparative field mapping - comp_field_mapping = self._get_comparative_fields_for_dataset( - repo_id, config_name - ) - for field, value in filters.items(): - if field in comp_field_mapping: - comparative_filters[field] = value - else: - primary_filters[field] = value - - # Apply primary filters first - if primary_filters: - metadata_df = self._apply_filters( - metadata_df, primary_filters, repo_id, config_name - ) - - # Enrich with comparative data if needed - # IMPORTANT: Do this BEFORE getting complete data so comparative fields - # are joined at the sample level, not measurement level - # This happens when: fields are requested from comparative datasets - # OR when filtering on comparative fields - if fields or comparative_filters: - 
comp_field_mapping = self._get_comparative_fields_for_dataset( - repo_id, config_name - ) - if fields: - requested_comp_fields = [ - f for f in fields if f in comp_field_mapping - ] - # Also need fields that are filtered on - filtered_comp_fields = [ - f for f in comparative_filters.keys() if f in comp_field_mapping - ] - all_comp_fields = list( - set(requested_comp_fields + filtered_comp_fields) - ) - if all_comp_fields: - metadata_df = self._enrich_with_comparative_data( - metadata_df, repo_id, config_name, all_comp_fields - ) - - # Apply comparative filters after enrichment - if comparative_filters: metadata_df = self._apply_filters( - metadata_df, comparative_filters, repo_id, config_name + metadata_df, filters, repo_id, config_name ) # If complete=True, join with full data - # Do this AFTER comparative enrichment so DTO fields are already added if complete: sample_ids = metadata_df["sample_id"].tolist() if sample_ids: @@ -547,9 +433,21 @@ def query( for field in fields: if field in metadata_df.columns and field not in keep_cols: keep_cols.append(field) + + # IMPORTANT: Also include fields used in filters + # This ensures that filtered fields are always returned, + # even if not in fields parameter + if filters: + for filter_field in filters.keys(): + if ( + filter_field in metadata_df.columns + and filter_field not in keep_cols + ): + keep_cols.append(filter_field) + metadata_df = metadata_df[keep_cols].copy() - # Add dataset identifier + # Add dataset identifier (ensure copy before modifying) if "dataset_id" not in metadata_df.columns: metadata_df = metadata_df.copy() metadata_df["dataset_id"] = f"{repo_id}/{config_name}" @@ -562,6 +460,117 @@ def query( # Concatenate results, filling NaN for missing columns return pd.concat(results, ignore_index=True, sort=False) + def query_dto( + self, + binding_dataset: tuple[str, str], + perturbation_dataset: tuple[str, str], + binding_filters: dict[str, Any] | None = None, + perturbation_filters: dict[str, Any] | None = None, + dto_filters: dict[str, Any] | None = None, + fields: list[str] | None = None, + ) -> pd.DataFrame: + """ + Query dto data filtered by binding and perturbation datasets. + + This method uses the existing query() function to get binding dataset data + (which automatically includes DTO fields via comparative_analyses join), + then filters by perturbation_id using pandas. 
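+
+        Matching uses composite identifiers of the form
+        "repo_id;config_name;sample_id"; a capitalized variant of the repo
+        name (e.g., hackett_2020 -> Hackett_2020) is also tried (Step 3).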
+
+        :param binding_dataset: (repo_id, config_name) for binding_id source
+        :param perturbation_dataset: (repo_id, config_name) for perturbation_id source
+        :param binding_filters: Filters to apply on binding dataset
+        :param perturbation_filters: Filters to apply on perturbation dataset
+        :param dto_filters: Filters on DTO fields,
+            e.g. {"pvalue": ("<=", 0.01)}
+        :param fields: Fields to return (None = all fields including DTO fields)
+        :return: DataFrame with matching DTO records
+
+        Examples:
+            # Basic usage: query DTO intersection
+            result = vdb.query_dto(
+                binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
+                perturbation_dataset=("BrentLab/hackett_2020", "hackett_2020"),
+                dto_filters={"pvalue": ("<=", 0.01)}
+            )
+
+            # With source dataset filters
+            result = vdb.query_dto(
+                binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
+                perturbation_dataset=("BrentLab/hackett_2020", "hackett_2020"),
+                binding_filters={"carbon_source": "glucose"},
+                perturbation_filters={"temperature_celsius": 30},
+                dto_filters={"pvalue": ("<=", 0.01), "fdr": ("<=", 0.05)},
+                fields=["sample_id", "binding_id", "perturbation_id", "pvalue", "fdr"]
+            )
+
+        """
+        # Step 1: Query binding dataset (DTO fields are joined automatically)
+        binding_df = self.query(
+            datasets=[binding_dataset],
+            filters=binding_filters,
+            fields=fields,  # query() selects the requested fields if given
+        )
+
+        if binding_df.empty:
+            return pd.DataFrame()
+
+        # Check if perturbation_id column exists (from DTO join)
+        if "perturbation_id" not in binding_df.columns:
+            # No DTO data joined, return empty
+            return pd.DataFrame()
+
+        # Step 2: Query perturbation dataset to get sample_ids
+        perturbation_repo, perturbation_config = perturbation_dataset
+        perturbation_df = self.query(
+            datasets=[perturbation_dataset],
+            filters=perturbation_filters,
+            fields=["sample_id"],
+        )
+
+        if perturbation_df.empty or "sample_id" not in perturbation_df.columns:
+            return pd.DataFrame()
+
+        # Step 3: Build composite IDs for perturbation dataset (with case variants)
+        perturbation_ids = set()
+        for sample_id in perturbation_df["sample_id"].astype(str).unique():
+            # Original format
+            perturbation_ids.add(
+                f"{perturbation_repo};{perturbation_config};{sample_id}"
+            )
+            # Capitalized variant (e.g., hackett_2020 -> Hackett_2020)
+            if "/" in perturbation_repo:
+                parts = perturbation_repo.split("/", 1)
+                if len(parts) == 2 and parts[1]:
+                    alt_repo = f"{parts[0]}/{parts[1][0].upper()}{parts[1][1:]}"
+                    perturbation_ids.add(
+                        f"{alt_repo};{perturbation_config};{sample_id}"
+                    )
+
+        # Step 4: Keep only binding rows whose perturbation_id matches
+        # (rows with NaN perturbation_id are dropped, since isin() is False there)
+        result_df = binding_df[
+            binding_df["perturbation_id"].isin(perturbation_ids)
+        ].copy()
+
+        if result_df.empty:
+            return pd.DataFrame()
+
+        # Step 5: Apply DTO filters if provided
+        if dto_filters:
+            # Get binding dataset info for filter application
+            binding_repo, binding_config = binding_dataset
+            result_df = self._apply_filters(
+                result_df, dto_filters, binding_repo, binding_config
+            )
+
+        # Step 6: Select requested fields if specified
+        if fields:
+            available_fields = [f for f in fields if f in result_df.columns]
+            if available_fields:
+                result_df = result_df[available_fields].copy()
+
+        return result_df
+
     def materialize_views(self, datasets: list[tuple[str, str]] | None = None) -> None:
         """
         Build and cache metadata DataFrames for faster subsequent queries.
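For orientation between hunks: a minimal sketch of how `query_dto` supports the notebook's intersection analysis. It assumes `vdb = VirtualDB(config_path)` as configured earlier; the Kemmeren repo/config names and the `pvalue`/`regulator_symbol` field names are assumptions for illustration (taken from the docstring examples and the analysis objectives), not verified identifiers.

```python
# Sketch: binding samples with DTO p <= 0.01 against BOTH perturbation
# datasets, then per-regulator counts on the intersection.
# The Kemmeren repo/config names below are hypothetical.
hackett = vdb.query_dto(
    binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
    perturbation_dataset=("BrentLab/hackett_2020", "hackett_2020"),
    dto_filters={"pvalue": ("<=", 0.01)},
    fields=["sample_id", "regulator_symbol", "pvalue"],
)
kemmeren = vdb.query_dto(
    binding_dataset=("BrentLab/harbison_2004", "harbison_2004"),
    perturbation_dataset=("BrentLab/kemmeren_2014", "kemmeren_2014_tfko"),
    dto_filters={"pvalue": ("<=", 0.01)},
    fields=["sample_id", "regulator_symbol", "pvalue"],
)

# Intersection: binding samples significant against both datasets
active = hackett[hackett["sample_id"].isin(kemmeren["sample_id"])]
# Number of active samples per regulator
counts = active.groupby("regulator_symbol")["sample_id"].nunique()
print(counts.sort_values(ascending=False))
```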
@@ -605,296 +614,6 @@ def invalidate_cache(self, datasets: list[tuple[str, str]] | None = None) -> Non if dataset_key in self.cache: del self.cache[dataset_key] - def _build_comparative_links(self) -> dict[tuple[str, str], dict[str, Any]]: - """ - Build mapping of primary datasets to their comparative dataset references. - - Returns dict keyed by (repo_id, config_name) with value being dict: { - "comparative_analyses": [ { "repo": comparative_repo_id, - "dataset": comparative_config_name, "via_field": - field_name_with_composite_ids } ] } - - """ - links: dict[tuple[str, str], dict[str, Any]] = {} - - for repo_id, repo_config in self.config.repositories.items(): - if not repo_config.dataset: - continue - - for config_name, dataset_config in repo_config.dataset.items(): - if dataset_config.comparative_analyses: - links[(repo_id, config_name)] = { - "comparative_analyses": [ - { - "repo": ca.repo, - "dataset": ca.dataset, - "via_field": ca.via_field, - } - for ca in dataset_config.comparative_analyses - ] - } - - return links - - def _get_comparative_fields_for_dataset( - self, repo_id: str, config_name: str - ) -> dict[str, dict[str, str]]: - """ - Get mapping of comparative fields available for a primary dataset. - - :param repo_id: Primary dataset repository ID - :param config_name: Primary dataset config name - :return: Dict mapping field_name to comparative dataset info - {field_name: { - "comp_repo": comparative_repo_id, - "comp_dataset": comparative_dataset_name, - "via_field": field_with_composite_ids - }} - - Example: - For callingcards dataset linked to DTO via binding_id: - { - "dto_fdr": { - "comp_repo": "BrentLab/yeast_comparative_analysis", - "comp_dataset": "dto", - "via_field": "binding_id" - }, - "dto_empirical_pvalue": {...} - } - - """ - field_mapping: dict[str, dict[str, str]] = {} - - # Get comparative analyses for this dataset - links = self._comparative_links.get((repo_id, config_name), {}) - if "comparative_analyses" not in links: - return field_mapping - - # For each comparative dataset, get its fields - for ca in links["comparative_analyses"]: - comp_repo = ca["repo"] - comp_dataset = ca["dataset"] - via_field = ca["via_field"] - - # Get fields from comparative dataset - comp_fields = self.get_fields(comp_repo, comp_dataset) - - # If no fields from config, try DataCard - if not comp_fields: - try: - from tfbpapi.datacard import DataCard - - card = DataCard(comp_repo, token=self.token) - config = card.get_config(comp_dataset) - if config and config.dataset_info: - comp_fields = [f.name for f in config.dataset_info.features] - except Exception: - comp_fields = [] - - # Map each field to this comparative dataset - for field_name in comp_fields: - # Skip the via_field itself (it's the join key) - if field_name == via_field: - continue - - field_mapping[field_name] = { - "comp_repo": comp_repo, - "comp_dataset": comp_dataset, - "via_field": via_field, - } - - return field_mapping - - def _enrich_with_comparative_data( - self, - primary_df: pd.DataFrame, - repo_id: str, - config_name: str, - requested_fields: list[str], - ) -> pd.DataFrame: - """ - Enrich primary dataset with fields from comparative datasets. 
- - :param primary_df: Primary dataset DataFrame with sample_id column - :param repo_id: Primary dataset repository ID - :param config_name: Primary dataset config name - :param requested_fields: List of field names requested by user - :return: DataFrame enriched with comparative fields - - """ - # Get mapping of which fields come from which comparative datasets - comp_field_mapping = self._get_comparative_fields_for_dataset( - repo_id, config_name - ) - - if not comp_field_mapping: - return primary_df - - # Find which requested fields are from comparative datasets - comp_fields_to_fetch = [f for f in requested_fields if f in comp_field_mapping] - - if not comp_fields_to_fetch: - return primary_df - - # Group fields by comparative dataset to minimize queries - by_comp_dataset: dict[tuple[str, str, str], list[str]] = {} - for field in comp_fields_to_fetch: - info = comp_field_mapping[field] - key = (info["comp_repo"], info["comp_dataset"], info["via_field"]) - if key not in by_comp_dataset: - by_comp_dataset[key] = [] - by_comp_dataset[key].append(field) - - # For each comparative dataset, load and join - result_df = primary_df.copy() - - for (comp_repo, comp_dataset, via_field), fields in by_comp_dataset.items(): - try: - # Load comparative dataset using HfCacheManager - # but query the raw data table instead of metadata view - from tfbpapi.hf_cache_manager import HfCacheManager - - comp_cache_mgr = HfCacheManager( - comp_repo, duckdb_conn=duckdb.connect(":memory:"), token=self.token - ) - - # Get the config to load data - comp_config = comp_cache_mgr.get_config(comp_dataset) - if not comp_config: - continue - - # Load the data (this will download and register parquet files) - result = comp_cache_mgr._get_metadata_for_config(comp_config) - if not result.get("success", False): - continue - - # Now query the raw data table directly (not the metadata view) - # The raw table name is config_name without "metadata_" prefix - select_fields = [via_field] + fields - columns = ", ".join(select_fields) - - # Query the actual parquet data by creating a view from the files - try: - # Get file paths that were loaded - import glob - - from huggingface_hub import snapshot_download - - cache_dir = snapshot_download( - repo_id=comp_repo, - repo_type="dataset", - allow_patterns=f"{comp_dataset}/**/*.parquet", - token=self.token, - ) - - parquet_files = glob.glob( - f"{cache_dir}/{comp_dataset}/**/*.parquet", recursive=True - ) - - if not parquet_files: - continue - - # Create a temporary view from parquet files - temp_view = f"temp_{comp_dataset}_raw" - files_sql = ", ".join([f"'{f}'" for f in parquet_files]) - comp_cache_mgr.duckdb_conn.execute( - f"CREATE OR REPLACE VIEW {temp_view} AS " - f"SELECT * FROM read_parquet([{files_sql}])" - ) - - # Query the view - sql = f"SELECT {columns} FROM {temp_view}" - comp_df = comp_cache_mgr.duckdb_conn.execute(sql).fetchdf() - - except Exception: - # If direct parquet loading fails, skip this comparative dataset - continue - - if comp_df.empty: - continue - - # Parse composite identifiers to extract sample_id - # via_field contains values like - # "BrentLab/harbison_2004;harbison_2004;123" - # We need to extract the third component and match on - # current repo/config - def extract_sample_id(composite_id: str) -> str | None: - """Extract sample_id if composite matches current dataset.""" - if pd.isna(composite_id): - return None - try: - parts = composite_id.split(";") - if len(parts) != 3: - return None - # Check if this composite ID references our dataset - if 
parts[0] == repo_id and parts[1] == config_name: - return parts[2] - return None - except Exception: - return None - - comp_df["_join_sample_id"] = comp_df[via_field].apply(extract_sample_id) - - # Convert _join_sample_id to match primary_df sample_id dtype - # This handles cases where sample_id is int but composite has string - if "_join_sample_id" in comp_df.columns: - primary_dtype = primary_df["sample_id"].dtype - if pd.api.types.is_integer_dtype(primary_dtype): - # Convert to numeric, coercing errors to NaN - comp_df["_join_sample_id"] = pd.to_numeric( - comp_df["_join_sample_id"], errors="coerce" - ) - elif pd.api.types.is_string_dtype(primary_dtype): - comp_df["_join_sample_id"] = comp_df["_join_sample_id"].astype( - str - ) - - # Filter to only rows that match our dataset - comp_df = comp_df[comp_df["_join_sample_id"].notna()].copy() - - if comp_df.empty: - continue - - # Drop the via_field column (we don't need it in results) - comp_df = comp_df.drop(columns=[via_field]) - - # Merge with primary data - result_df = result_df.merge( - comp_df, left_on="sample_id", right_on="_join_sample_id", how="left" - ) - - # Drop the temporary join column - result_df = result_df.drop(columns=["_join_sample_id"]) - - except Exception: - # If enrichment fails for this comparative dataset, continue - continue - - return result_df - - @staticmethod - def _parse_composite_identifier(composite_id: str) -> tuple[str, str, str]: - """ - Parse composite sample identifier into components. - - :param composite_id: Composite ID in format "repo_id;config_name;sample_id" - :return: Tuple of (repo_id, config_name, sample_id) - - Example: - _parse_composite_identifier( - "BrentLab/harbison_2004;harbison_2004;sample_42" - ) - Returns: ("BrentLab/harbison_2004", "harbison_2004", "sample_42") - - """ - parts = composite_id.split(";") - if len(parts) != 3: - raise ValueError( - f"Invalid composite ID format: {composite_id}. 
" - "Expected 'repo_id;config_name;sample_id'" - ) - return parts[0], parts[1], parts[2] - def _build_metadata_table( self, repo_id: str, config_name: str, use_cache: bool = True ) -> pd.DataFrame: @@ -941,19 +660,23 @@ def _build_metadata_table( # Get sample-level data from HuggingFace config = card.get_config(config_name) - # Check if this is a comparative dataset - from tfbpapi.models import DatasetType - - is_comparative = ( - config - and hasattr(config, "dataset_type") - and config.dataset_type == DatasetType.COMPARATIVE - ) + # Check if sample_id exists in the data by trying a sample query + has_sample_id = False + try: + sample_df = cache_mgr.query( + f"SELECT sample_id FROM {config_name} LIMIT 1", config_name + ) + has_sample_id = "sample_id" in sample_df.columns + except Exception: + # If query fails, assume sample_id doesn't exist + has_sample_id = False if config and hasattr(config, "metadata_fields") and config.metadata_fields: # Select only metadata fields columns = ", ".join(config.metadata_fields) - if not is_comparative and "sample_id" not in config.metadata_fields: + # Only add sample_id field if it exists in the data + # and not already in metadata_fields + if has_sample_id and "sample_id" not in config.metadata_fields: columns = f"sample_id, {columns}" sql = f"SELECT DISTINCT {columns} FROM {config_name}" else: @@ -962,9 +685,12 @@ def _build_metadata_table( df = cache_mgr.query(sql, config_name) - # For non-comparative datasets: one row per sample_id - # For comparative datasets: keep all rows (each row is a relationship) - if not is_comparative and "sample_id" in df.columns: + # If sample_id doesn't exist, generate from row number + if "sample_id" not in df.columns and not df.empty: + df["sample_id"] = df.index.astype(str) + + # One row per sample_id + if "sample_id" in df.columns: df = df.groupby("sample_id").first().reset_index() # Add repo-level metadata as columns @@ -976,8 +702,8 @@ def _build_metadata_table( if field_metadata: df = self._add_field_metadata(df, field_metadata) - # Apply dtype conversions to DataFrame columns - df = self._apply_column_dtypes(df, property_mappings) + # Join comparative analyses data if configured + df = self._join_comparative_analyses(df, repo_id, config_name) # Cache result if use_cache: @@ -985,73 +711,9 @@ def _build_metadata_table( return df - except Exception as e: - # Log error for debugging with full traceback - import traceback - - print(f"Error downloading metadata for {config_name}: {e}") - traceback.print_exc() - # Return empty DataFrame on error + except Exception: return pd.DataFrame() - def _apply_column_dtypes( - self, df: pd.DataFrame, property_mappings: dict[str, PropertyMapping] - ) -> pd.DataFrame: - """ - Apply dtype conversions to DataFrame columns based on property mappings. 
- - :param df: DataFrame to apply conversions to - :param property_mappings: Property mappings with dtype specifications - :return: DataFrame with converted column dtypes - - """ - for prop_name, mapping in property_mappings.items(): - # Skip if no dtype specified or column doesn't exist - if not mapping.dtype or prop_name not in df.columns: - continue - - # Convert column dtype - try: - if mapping.dtype == "numeric": - df[prop_name] = pd.to_numeric(df[prop_name], errors="coerce") - elif mapping.dtype == "bool": - df[prop_name] = df[prop_name].astype(bool) - elif mapping.dtype == "string": - df[prop_name] = df[prop_name].astype(str) - except (ValueError, TypeError): - # Conversion failed, leave as is - pass - - return df - - def _convert_dtype(self, value: Any, dtype: str) -> Any: - """ - Convert value to specified data type. - - :param value: The value to convert to a given `dtype` - :param dtype: Target data type ("numeric", "bool", "string") - - :return: Converted value or None if conversion fails - - """ - if value is None: - return None - - try: - if dtype == "numeric": - # Try float first (handles both int and float) - return float(value) - elif dtype == "bool": - return bool(value) - elif dtype == "string": - return str(value) - else: - # Unknown dtype, pass through unchanged - return value - except (ValueError, TypeError): - # Conversion failed, return None - return None - def _extract_repo_level( self, card: DataCard, @@ -1085,12 +747,14 @@ def _extract_repo_level( continue # Build full path - # Note: `conditions` is already the experimental_conditions dict, - # so we don't add the prefix full_path = mapping.path + # Skip if path is None (shouldn't happen for repo-level, but be safe) + if full_path is None: + continue + # Get value at path - value = get_nested_value(conditions, full_path) # type: ignore + value = get_nested_value(conditions, full_path) # Handle missing values missing_label = self.config.missing_value_labels.get(prop_name) @@ -1102,12 +766,6 @@ def _extract_repo_level( # Ensure value is a list actual_values = [value] if not isinstance(value, list) else value - # Apply dtype conversion if specified - if mapping.dtype: - actual_values = [ - self._convert_dtype(v, mapping.dtype) for v in actual_values - ] - # Normalize using aliases aliases = self.config.factor_aliases.get(prop_name) normalized_values = [ @@ -1136,18 +794,17 @@ def _extract_field_level( field_metadata: dict[str, dict[str, Any]] = {} # Group property mappings by field - field_mappings: dict[str, dict[str, PropertyMapping]] = {} + field_mappings: dict[str, dict[str, str | None]] = {} for prop_name, mapping in property_mappings.items(): - # Only process if field is specified AND path exists - # (no path means it's just a column alias, not metadata extraction) - if mapping.field is not None and mapping.path is not None: + if mapping.field is not None: field_name = mapping.field if field_name not in field_mappings: field_mappings[field_name] = {} - field_mappings[field_name][prop_name] = mapping + # Store path (can be None for column aliases) + field_mappings[field_name][prop_name] = mapping.path # Process each field that has mappings - for field_name, prop_mappings_dict in field_mappings.items(): + for field_name, prop_paths in field_mappings.items(): # Get field definitions definitions = card.get_field_definitions(config_name, field_name) if not definitions: @@ -1158,9 +815,13 @@ def _extract_field_level( if field_value not in field_metadata: field_metadata[field_value] = {} - for prop_name, mapping in 
prop_mappings_dict.items(): - # Get value at path - value = get_nested_value(definition, mapping.path) # type: ignore + for prop_name, path in prop_paths.items(): + # Handle path=None case: use field_value directly + if path is None: + value = field_value + else: + # Get value at path + value = get_nested_value(definition, path) # Handle missing values missing_label = self.config.missing_value_labels.get(prop_name) @@ -1172,12 +833,6 @@ def _extract_field_level( # Ensure value is a list actual_values = [value] if not isinstance(value, list) else value - # Apply dtype conversion if specified - if mapping.dtype: - actual_values = [ - self._convert_dtype(v, mapping.dtype) for v in actual_values - ] - # Normalize using aliases aliases = self.config.factor_aliases.get(prop_name) normalized_values = [ @@ -1243,23 +898,31 @@ def _apply_filters( # Handle numeric range filters if isinstance(filter_value, tuple): operator = filter_value[0] + # For numeric comparisons, try to convert column to numeric + # (normalize_value returns strings, + # but we need numeric for range queries) + try: + df_field = pd.to_numeric(df[field], errors="coerce") + except (ValueError, TypeError): + df_field = df[field] + if operator == "between" and len(filter_value) == 3: df = df[ - (df[field] >= filter_value[1]) & (df[field] <= filter_value[2]) + (df_field >= filter_value[1]) & (df_field <= filter_value[2]) ] elif operator in (">=", ">", "<=", "<", "==", "!="): if operator == ">=": - df = df[df[field] >= filter_value[1]] + df = df[df_field >= filter_value[1]] elif operator == ">": - df = df[df[field] > filter_value[1]] + df = df[df_field > filter_value[1]] elif operator == "<=": - df = df[df[field] <= filter_value[1]] + df = df[df_field <= filter_value[1]] elif operator == "<": - df = df[df[field] < filter_value[1]] + df = df[df_field < filter_value[1]] elif operator == "==": - df = df[df[field] == filter_value[1]] + df = df[df_field == filter_value[1]] elif operator == "!=": - df = df[df[field] != filter_value[1]] + df = df[df_field != filter_value[1]] else: # Exact match with alias expansion aliases = self.config.factor_aliases.get(field) @@ -1273,9 +936,11 @@ def _apply_filters( df = df[df[field].isin(expanded_values)] else: # No aliases, exact match - df = df[df[field] == filter_value] + # Handle type conversion: normalize_value returns strings, + # so convert filter_value to string for comparison + df = df[df[field] == str(filter_value)] - return df + return df.copy() def _get_complete_data( self, @@ -1331,6 +996,264 @@ def _get_complete_data( except Exception: return pd.DataFrame() + @staticmethod + def _parse_composite_identifier(composite_id: str) -> tuple[str, str, str]: + """ + Parse composite identifier into repo_id, config_name, and sample_id. + + Format: "repo_id;config_name;sample_id" + + :param composite_id: Composite identifier string + :return: Tuple of (repo_id, config_name, sample_id) + :raises ValueError: If format is invalid + + Example: + >>> VirtualDB._parse_composite_identifier( + ... "BrentLab/harbison_2004;harbison_2004;42" + ... ) + ("BrentLab/harbison_2004", "harbison_2004", "42") + + """ + parts = composite_id.split(";") + if len(parts) != 3: + raise ValueError( + f"Invalid composite ID format: {composite_id}. " + "Expected format: 'repo_id;config_name;sample_id'" + ) + return tuple(parts) # type: ignore + + def _join_comparative_analyses( + self, df: pd.DataFrame, repo_id: str, config_name: str + ) -> pd.DataFrame: + """ + Join comparative analyses data to the primary dataset DataFrame. 
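+
+        Joined columns are renamed to their configured property names; on a
+        name collision the comparative dataset name is appended as a suffix.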
+ + For each comparative_analysis configured for this dataset, loads the comparative + dataset directly via SQL and joins fields via composite identifiers. + + :param df: Primary dataset DataFrame with sample_id column + :param repo_id: Repository ID of the primary dataset + :param config_name: Config name of the primary dataset + :return: DataFrame with joined comparative analysis fields + + """ + if df.empty or "sample_id" not in df.columns: + return df + + # Get dataset configuration + repo_config = self.config.get_repository_config(repo_id) + if not repo_config or not repo_config.dataset: + return df + + dataset_config = repo_config.dataset.get(config_name) + if not dataset_config or not dataset_config.comparative_analyses: + return df + + result_df = df.copy() + + # Process each comparative analysis + for comp_analysis in dataset_config.comparative_analyses: + try: + # Build composite identifier column for join + # Format: "repo_id;config_name;sample_id" + temp_composite_col = "_temp_composite_id" + result_df[temp_composite_col] = f"{repo_id};{config_name};" + result_df[ + "sample_id" + ].astype(str) + + # Get property mappings for comparative dataset + comp_mappings = self.config.get_property_mappings( + comp_analysis.repo, comp_analysis.dataset + ) + + # Build mapping from property names to actual column names + # PropertyMapping.field points to the actual column name in the dataset + prop_to_col: dict[str, str] = {} + for prop_name, mapping in comp_mappings.items(): + if mapping.field: + prop_to_col[prop_name] = mapping.field + else: + prop_to_col[prop_name] = prop_name + + # Get via_field actual column name + via_field = comp_analysis.via_field + via_field_col = prop_to_col.get(via_field, via_field) + + # Determine which fields to select from comparative dataset + # Exclude via_field and sample_id from the join fields + fields_to_join_props = [ + prop_name + for prop_name in prop_to_col.keys() + if prop_name not in ["sample_id", via_field, "dataset_id"] + ] + + if not fields_to_join_props: + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + # Build SQL columns: actual column names + sql_columns = [via_field_col] + sql_columns.extend([prop_to_col[prop] for prop in fields_to_join_props]) + sql_columns_str = ", ".join(sql_columns) + + # Load comparative dataset directly via SQL + # This bypasses _build_metadata_table which may not work + # for comparative datasets + comp_cache_mgr = HfCacheManager( + comp_analysis.repo, + duckdb_conn=duckdb.connect(":memory:"), + token=self.token, + ) + + # Get the actual table name (metadata_{config_name}) + # to avoid string replacement issues + comp_config = comp_cache_mgr.get_config(comp_analysis.dataset) + if not comp_config: + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + # Load the config to get the actual table name + config_result = comp_cache_mgr._get_metadata_for_config( + comp_config, force_refresh=False + ) + if not config_result.get("success", False): + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + actual_table_name = config_result.get("table_name") + if not actual_table_name: + actual_table_name = f"metadata_{comp_analysis.dataset}" + + # Build WHERE clause to filter only matching records + # Try both original repo_id and capitalized version + # (e.g., hackett_2020 -> Hackett_2020) + composite_ids = result_df[temp_composite_col].unique().tolist() + + # Generate alternative repo_id format with capitalized + # first letter after slash + # e.g., 
"BrentLab/hackett_2020" -> "BrentLab/Hackett_2020" + alternative_repo_id = repo_id + if "/" in repo_id: + parts = repo_id.split("/", 1) + if len(parts) == 2 and parts[1]: + # Capitalize first letter of dataset name + alternative_repo_id = ( + f"{parts[0]}/{parts[1][0].upper()}{parts[1][1:]}" + ) + + # Build composite IDs with both formats + all_composite_ids = set(composite_ids) # Original format + if alternative_repo_id != repo_id: + # Add alternative format for each sample_id + for sample_id in result_df["sample_id"].astype(str).unique(): + alt_id = f"{alternative_repo_id};{config_name};{sample_id}" + all_composite_ids.add(alt_id) + + # Add forward-slash format variants + # (e.g., "BrentLab/rossi_2021/rossi_2021_af_combined;{sample_id}") + # This handles cases where DTO data uses "/" instead of ";" + # between repo_id and config_name + for sample_id in result_df["sample_id"].astype(str).unique(): + # Format: "repo_id/config_name;sample_id" + slash_format_id = f"{repo_id}/{config_name};{sample_id}" + all_composite_ids.add(slash_format_id) + # Also add capitalized variant if applicable + if alternative_repo_id != repo_id: + alt_slash_format_id = ( + f"{alternative_repo_id}/{config_name};{sample_id}" + ) + all_composite_ids.add(alt_slash_format_id) + + # Escape single quotes in composite IDs + escaped_ids = [cid.replace("'", "''") for cid in all_composite_ids] + id_list = ", ".join([f"'{cid}'" for cid in escaped_ids]) + + # Use actual table name directly to avoid column name replacement issues + sql = f""" + SELECT {sql_columns_str} + FROM {actual_table_name} + WHERE {via_field_col} IN ({id_list}) + """ + + # Execute query directly instead of using query() method + # to avoid string replacement + try: + comp_df = comp_cache_mgr.duckdb_conn.execute(sql).fetchdf() + + except Exception: + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + if comp_df.empty: + result_df = result_df.drop(columns=[temp_composite_col]) + continue + + # Rename columns to use property names (config names) + # instead of raw column names + rename_dict = {via_field_col: via_field} + for prop_name in fields_to_join_props: + actual_col = prop_to_col[prop_name] + if actual_col != prop_name: + rename_dict[actual_col] = prop_name + + comp_df = comp_df.rename(columns=rename_dict) + + # Map DTO composite IDs back to original format for join + # This handles cases where DTO uses: + # 1. Capitalized repo_id (e.g., Hackett_2020) + # 2. 
Forward-slash format
+                #    (e.g., "BrentLab/rossi_2021/rossi_2021_af_combined;{sample_id}")
+                if via_field in comp_df.columns:
+                    # Create mapping from all alternative formats to original format
+                    id_mapping = {}
+                    for sample_id in result_df["sample_id"].astype(str).unique():
+                        original_id = f"{repo_id};{config_name};{sample_id}"
+
+                        # Add capitalized variant mapping
+                        if alternative_repo_id != repo_id:
+                            alt_id = f"{alternative_repo_id};{config_name};{sample_id}"
+                            id_mapping[alt_id] = original_id
+
+                        # Add forward-slash format mappings
+                        slash_format_id = f"{repo_id}/{config_name};{sample_id}"
+                        id_mapping[slash_format_id] = original_id
+                        if alternative_repo_id != repo_id:
+                            alt_slash_format_id = (
+                                f"{alternative_repo_id}/{config_name};{sample_id}"
+                            )
+                            id_mapping[alt_slash_format_id] = original_id
+
+                    # Map alternative format IDs back to the original format
+                    # (dict.get falls back to the unmapped value)
+                    comp_df[temp_composite_col] = comp_df[via_field].map(
+                        lambda x: id_mapping.get(x, x)
+                    )
+                    # Use mapped column for join
+                    join_right_on = temp_composite_col
+                else:
+                    # Defensive fallback; via_field is always selected above,
+                    # so this branch should not normally be reached
+                    join_right_on = via_field
+
+                # Perform left join on composite identifier
+                result_df = result_df.merge(
+                    comp_df,
+                    left_on=temp_composite_col,
+                    right_on=join_right_on,
+                    how="left",
+                    suffixes=("", f"_{comp_analysis.dataset}"),
+                )
+
+                # Drop the temporary composite_id column
+                result_df = result_df.drop(columns=[temp_composite_col])
+
+            except Exception:
+                # Clean up temp column if it exists
+                if temp_composite_col in result_df.columns:
+                    result_df = result_df.drop(columns=[temp_composite_col])
+                continue
+
+        return result_df
+
     def __repr__(self) -> str:
         """String representation."""
         n_repos = len(self.config.repositories)
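To make the join semantics above concrete, here is a small standalone sketch of the identifier-variant generation that `_join_comparative_analyses` performs; `composite_id_variants` is a hypothetical helper for illustration, not part of the API.

```python
# Sketch of the composite-ID spellings tried when matching DTO rows to a
# primary sample: canonical "repo;config;sample_id", a capitalized repo
# variant, and slash-separated forms of both.
def composite_id_variants(
    repo_id: str, config_name: str, sample_id: str
) -> set[str]:
    """Return the canonical composite ID plus its known variants."""
    variants = {f"{repo_id};{config_name};{sample_id}"}
    # Slash-separated form is always tried
    variants.add(f"{repo_id}/{config_name};{sample_id}")
    org, _, name = repo_id.partition("/")
    if name:
        # Capitalize the first letter of the dataset name after the slash
        alt_repo = f"{org}/{name[0].upper()}{name[1:]}"
        variants.add(f"{alt_repo};{config_name};{sample_id}")
        variants.add(f"{alt_repo}/{config_name};{sample_id}")
    return variants

# composite_id_variants("BrentLab/hackett_2020", "hackett_2020", "42") ->
# {"BrentLab/hackett_2020;hackett_2020;42",
#  "BrentLab/Hackett_2020;hackett_2020;42",
#  "BrentLab/hackett_2020/hackett_2020;42",
#  "BrentLab/Hackett_2020/hackett_2020;42"}
```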