#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2026-01-29 22:30:00 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-dataset/src/scitex_dataset/__init__.py
"""
SciTeX Dataset - Unified interface for scientific dataset discovery.
Domains:
- neuroscience: OpenNeuro, DANDI, PhysioNet
- general: Scientific Data, Zenodo
- biology: GEO (Gene Expression Omnibus)
- pharmacology: ChEMBL
- medical: ClinicalTrials.gov
Usage:
>>> from scitex_dataset import neuroscience
>>> datasets = neuroscience.fetch_all_datasets(max_datasets=10)
>>> # Or direct import for convenience
>>> from scitex_dataset import fetch_all_datasets, search_datasets
>>> # Local database for fast searching
>>> from scitex_dataset import database as db
>>> db.build() # Fetch all sources and index
>>> results = db.search("alzheimer EEG", min_subjects=20)
"""
from __future__ import annotations
try:
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version as _v
try:
__version__ = _v("scitex-dataset")
except PackageNotFoundError:
__version__ = "0.0.0+local"
del _v, PackageNotFoundError
except ImportError: # pragma: no cover — only on ancient Pythons
__version__ = "0.0.0+local"
# Domain submodules
from . import _api as _api # noqa: F401
from . import biology, database, general, medical, neuroscience, pharmacology
# Per-source ``<src>_fetch`` / ``<src>_format`` aliases — give every MCP
# tool a matching Python callable for the audit-mcp-tools § 6 parity
# check. See _api.py for the explicit list.
from ._api import ( # noqa: F401
chembl_fetch,
clinicaltrials_fetch,
dandi_fetch,
figshare_fetch,
geo_fetch,
huggingface_download_file,
huggingface_fetch,
huggingface_info,
huggingface_search,
moleculenet_fetch,
openml_fetch,
openneuro_fetch,
physionet_fetch,
zenodo_fetch,
)
# DB-level aliases for MCP parity (`dataset_db_*` tools).
from .database import build as db_build # noqa: F401
from .database import get_stats as db_show_stats # noqa: F401
from .database import search as db_search # noqa: F401
# Convenience exports from neuroscience.openneuro (primary source)
from .neuroscience.openneuro import (
OPENNEURO_API,
fetch_all_datasets,
fetch_datasets,
format_dataset,
)
from .search import search_datasets, sort_datasets
[docs]
def list_sources() -> dict:
"""Return the 11-source registry — matches ``dataset_list_sources`` MCP tool."""
from ._sources import SOURCE_INFO
return {"sources": SOURCE_INFO, "count": len(SOURCE_INFO)}
[docs]
def filter_results(datasets, **kwargs):
"""Filter and rank dataset dicts — matches ``dataset_filter_results`` MCP tool."""
from .search import search_datasets as _search
from .search import sort_datasets as _sort
sort_by = kwargs.pop("sort_by", "downloads")
limit = kwargs.pop("limit", None)
out = _search(datasets, **kwargs)
out = _sort(out, by=sort_by, descending=True)
return out[:limit] if limit else out
__all__ = [
"__version__",
# Domains
"neuroscience",
"general",
"biology",
"pharmacology",
"medical",
# Database submodule + db_* aliases (MCP parity)
"database",
"db_build",
"db_search",
"db_show_stats",
# Convenience (OpenNeuro)
"fetch_datasets",
"fetch_all_datasets",
"format_dataset",
# Search + filter
"search_datasets",
"sort_datasets",
"filter_results",
"list_sources",
# Per-source <src>_fetch aliases (MCP parity)
"openneuro_fetch",
"dandi_fetch",
"physionet_fetch",
"zenodo_fetch",
"figshare_fetch",
"openml_fetch",
"moleculenet_fetch",
"geo_fetch",
"chembl_fetch",
"clinicaltrials_fetch",
# HuggingFace family
"huggingface_fetch",
"huggingface_search",
"huggingface_info",
"huggingface_download_file",
]
# EOF