{
  "version": "2.2.0",
  "generated": "2026-06-01",
  "count": 126,
  "benchmarks": [
    {
      "id": "open-targets",
      "name": "Open Targets Platform",
      "stages": [
        "disease-modeling",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "target-disease-association",
        "prioritization"
      ],
      "description": "Integrates genetic, genomic, pharmacological evidence to score targets for 20k+ diseases.",
      "size": {
        "targets": 63000,
        "diseases": 28000,
        "evidence": 18000000
      },
      "primary_paper": {
        "title": "The Open Targets Platform: supporting systematic drug-target identification and prioritisation",
        "authors": [
          "Ochoa D",
          "Hercules A",
          "Carmona M",
          "et al."
        ],
        "year": 2021,
        "doi": "10.1093/nar/gkaa1027",
        "citations": 1100
      },
      "official_url": "https://platform.opentargets.org/",
      "github_url": "https://github.com/opentargets",
      "leaderboard_url": "N/A",
      "license": "CC0 / Apache-2.0",
      "first_release": "2016-12",
      "last_updated": "2025-06",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Industry gold standard for target prioritization. Quarterly versioned releases.",
      "related_benchmarks": [
        "depmap",
        "disgenet",
        "primekg"
      ],
      "expert_ids": [
        "ian-dunham",
        "ellen-mcdonagh"
      ],
      "group_ids": [
        "opentargets",
        "embl-ebi",
        "gsk",
        "sanger"
      ],
      "hosted_by": [
        "elixir"
      ],
      "composite_score": 100.0,
      "experimental_validation": "wet-lab-confirmed",
      "dataset_url": "https://platform.opentargets.org/downloads",
      "paper_url": "https://doi.org/10.1093/nar/gkaa1027"
    },
    {
      "id": "depmap",
      "name": "DepMap (Cancer Dependency Map)",
      "stages": [
        "target-id",
        "disease-modeling"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "essentiality",
        "biomarker"
      ],
      "description": "Genome-scale CRISPR/RNAi essentiality across 1150 cancer cell lines + omics.",
      "size": {
        "cell_lines": 1150,
        "genes": 18000
      },
      "primary_paper": {
        "title": "Defining a Cancer Dependency Map",
        "authors": [
          "Tsherniak A",
          "Vazquez F",
          "Montgomery PG",
          "et al."
        ],
        "year": 2017,
        "doi": "10.1016/j.cell.2017.06.010",
        "citations": 2646
      },
      "official_url": "https://depmap.org/portal/",
      "github_url": "https://github.com/broadinstitute/depmap",
      "leaderboard_url": "https://depmap.org/portal/prediction/",
      "license": "CC-BY 4.0",
      "first_release": "2017-07",
      "last_updated": "2025-06",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Quarterly release cadence.",
      "related_benchmarks": [
        "open-targets",
        "lincs-l1000"
      ],
      "expert_ids": [
        "aviad-tsherniak",
        "william-hahn",
        "todd-golub"
      ],
      "group_ids": [
        "broad-depmap"
      ],
      "hosted_by": [],
      "composite_score": 100.0,
      "experimental_validation": "wet-lab-confirmed",
      "dataset_url": "https://depmap.org/portal/download/",
      "paper_url": "https://doi.org/10.1016/j.cell.2017.06.010"
    },
    {
      "id": "tdc-admet",
      "name": "TDC ADMET Group",
      "stages": [
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "classification"
      ],
      "description": "22-task ADMET benchmark suite with scaffold splits \u2014 core TDC leaderboard.",
      "size": {
        "tasks": 22,
        "molecules": 130000
      },
      "primary_paper": {
        "title": "Therapeutics Data Commons: Machine Learning Datasets and Tasks for Drug Discovery and Development",
        "authors": [
          "Huang K",
          "Fu T",
          "Gao W",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2102.09548",
        "citations": 620
      },
      "official_url": "https://tdcommons.ai/benchmark/admet_group/overview/",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/overview/",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Most-adopted ADMET benchmark. 100+ leaderboard submissions.",
      "related_benchmarks": [
        "moleculenet",
        "admet-ai",
        "polaris-admet",
        "molecule-ace"
      ],
      "expert_ids": [
        "kexin-huang",
        "marinka-zitnik",
        "tianfan-fu"
      ],
      "group_ids": [
        "zitnik-lab",
        "tdc"
      ],
      "hosted_by": [
        "tdc",
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 100.0,
      "experimental_validation": "retrospective",
      "huggingface_url": "https://huggingface.co/datasets/tdcommons/tdc",
      "paper_url": "https://doi.org/10.48550/arXiv.2102.09548"
    },
    {
      "id": "sabdab",
      "name": "SAbDab",
      "stages": [
        "hit-id",
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "antibody-structure",
        "affinity"
      ],
      "description": "Structural antibody database \u2014 curated PDB antibody structures with annotation.",
      "size": {
        "structures": 9500,
        "complexes": 2600
      },
      "primary_paper": {
        "title": "SAbDab: the structural antibody database",
        "authors": [
          "Dunbar J",
          "Krawczyk K",
          "Leem J",
          "et al."
        ],
        "year": 2014,
        "doi": "10.1093/nar/gkt1043",
        "citations": 676
      },
      "official_url": "https://opig.stats.ox.ac.uk/webapps/newsabdab/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2013",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Canonical antibody structure resource. Weekly updates.",
      "related_benchmarks": [
        "oas",
        "cov-abdab",
        "iglm-bench"
      ],
      "expert_ids": [
        "charlotte-deane",
        "james-dunbar"
      ],
      "group_ids": [
        "oxford-opig"
      ],
      "hosted_by": [],
      "composite_score": 100.0,
      "experimental_validation": "retrospective",
      "dataset_url": "https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab",
      "paper_url": "https://doi.org/10.1093/nar/gkt1043"
    },
    {
      "id": "chembl",
      "name": "ChEMBL",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "bioactivity",
        "data-resource"
      ],
      "description": "Manually curated bioactive molecule DB; backbone for most ML chemistry benchmarks.",
      "size": {
        "compounds": 2400000,
        "activities": 20700000,
        "targets": 15398
      },
      "primary_paper": {
        "title": "The ChEMBL Database in 2023",
        "authors": [
          "Zdrazil B",
          "Felix E",
          "Hunter F",
          "et al."
        ],
        "year": 2024,
        "doi": "10.1093/nar/gkad1004",
        "citations": 1018
      },
      "official_url": "https://www.ebi.ac.uk/chembl/",
      "github_url": "https://github.com/chembl",
      "leaderboard_url": "N/A",
      "license": "CC-BY-SA 3.0",
      "first_release": "2009",
      "last_updated": "2025-05",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Underlies ~80% of public bioactivity ML benchmarks.",
      "related_benchmarks": [
        "tdc-admet",
        "moleculenet",
        "pubchem-bioassay"
      ],
      "expert_ids": [
        "andrew-leach",
        "barbara-zdrazil"
      ],
      "group_ids": [
        "embl-ebi"
      ],
      "hosted_by": [
        "elixir"
      ],
      "composite_score": 97.5,
      "experimental_validation": "wet-lab-confirmed",
      "dataset_url": "https://www.ebi.ac.uk/chembl/downloads",
      "paper_url": "https://doi.org/10.1093/nar/gkad1004"
    },
    {
      "id": "oas",
      "name": "Observed Antibody Space (OAS)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "antibody-sequence"
      ],
      "description": "Repository of >1B antibody BCR sequences from public repertoires \u2014 core for antibody LM pretraining.",
      "size": {
        "sequences": 2400000000,
        "repertoires": 15000
      },
      "primary_paper": {
        "title": "Observed Antibody Space: A diverse database of cleaned, annotated, and translated unpaired and paired antibody sequences",
        "authors": [
          "Olsen TH",
          "Boyles F",
          "Deane CM"
        ],
        "year": 2022,
        "doi": "10.1002/pro.4205",
        "citations": 331
      },
      "official_url": "https://opig.stats.ox.ac.uk/webapps/oas/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2018",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Underlies AbLang, IgLM, AntiBERTa \u2014 industry-adopted.",
      "related_benchmarks": [
        "sabdab",
        "cov-abdab",
        "iglm-bench"
      ],
      "expert_ids": [
        "charlotte-deane",
        "tobias-olsen"
      ],
      "group_ids": [
        "oxford-opig"
      ],
      "hosted_by": [],
      "composite_score": 97.5,
      "experimental_validation": "retrospective",
      "dataset_url": "https://opig.stats.ox.ac.uk/webapps/oas/",
      "paper_url": "https://doi.org/10.1002/pro.4205"
    },
    {
      "id": "proteingym",
      "name": "ProteinGym",
      "stages": [
        "target-id",
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "variant-effect"
      ],
      "description": "217 DMS substitution assays + indel + clinical variants \u2014 de facto standard for VEPs.",
      "size": {
        "dms_assays": 217,
        "mutations": 2700000,
        "clinical_variants": 2525
      },
      "primary_paper": {
        "title": "ProteinGym: Large-Scale Benchmarks for Protein Fitness Prediction and Design",
        "authors": [
          "Notin P",
          "Kollasch A",
          "Ritter D",
          "et al."
        ],
        "year": 2023,
        "doi": "10.48550/arXiv.2305.06259",
        "citations": 320
      },
      "official_url": "https://proteingym.org/",
      "github_url": "https://github.com/OATML-Markslab/ProteinGym",
      "leaderboard_url": "https://proteingym.org/benchmarks",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Field standard. Clinical track enables fair ESM/EVE/AlphaMissense comparison.",
      "related_benchmarks": [
        "flip",
        "cafa-benchmark"
      ],
      "expert_ids": [
        "debora-marks",
        "pascal-notin",
        "yarin-gal"
      ],
      "group_ids": [
        "marks-lab",
        "oatml"
      ],
      "hosted_by": [
        "proteingym"
      ],
      "composite_score": 97.5,
      "experimental_validation": "wet-lab-confirmed",
      "dataset_url": "https://proteingym.org/download",
      "paper_url": "https://doi.org/10.48550/arXiv.2305.06259"
    },
    {
      "id": "posebusters",
      "name": "PoseBusters",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "pose-validation"
      ],
      "description": "Physics-aware eval of docking/co-folding poses \u2014 19 checks catching chemically impossible outputs.",
      "size": {
        "complexes": 428,
        "checks_per_pose": 19
      },
      "primary_paper": {
        "title": "PoseBusters: AI-based docking methods fail to generate physically valid poses or generalise to novel sequences",
        "authors": [
          "Buttenschoen M",
          "Morris GM",
          "Deane CM"
        ],
        "year": 2024,
        "doi": "10.1039/D3SC04185A",
        "citations": 360
      },
      "official_url": "https://posebusters.readthedocs.io/",
      "github_url": "https://github.com/maabuu/posebusters",
      "leaderboard_url": "https://github.com/maabuu/posebusters",
      "license": "BSD-3-Clause",
      "first_release": "2023-08",
      "last_updated": "2025-02",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Exposed major failure modes in AlphaFold-Multimer/DiffDock/RFAA. Default pharma filter.",
      "related_benchmarks": [
        "plinder",
        "pinder",
        "casf-2016"
      ],
      "expert_ids": [
        "charlotte-deane",
        "martin-buttenschoen"
      ],
      "group_ids": [
        "oxford-opig"
      ],
      "hosted_by": [
        "posebusters-initiative"
      ],
      "composite_score": 97.0,
      "experimental_validation": "retrospective",
      "dataset_url": "https://zenodo.org/records/8278563",
      "paper_url": "https://doi.org/10.1039/D3SC04185A"
    },
    {
      "id": "plinder",
      "name": "PLINDER",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "docking",
        "structure-based-benchmark"
      ],
      "description": "Leakage-aware protein-ligand interaction dataset with stratified splits.",
      "size": {
        "complexes": 1400000,
        "unique_systems": 460000
      },
      "primary_paper": {
        "title": "PLINDER: The protein-ligand interactions dataset and evaluation resource",
        "authors": [
          "Durairaj J",
          "Adeshina Y",
          "Cao Z",
          "et al."
        ],
        "year": 2024,
        "doi": "10.48550/arXiv.2409.17475",
        "citations": 55
      },
      "official_url": "https://www.plinder.sh/",
      "github_url": "https://github.com/plinder-org/plinder",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2024-09",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Replaces PDBbind as the modern leakage-controlled docking standard.",
      "related_benchmarks": [
        "pdbbind",
        "pinder",
        "posebusters"
      ],
      "expert_ids": [
        "torsten-schwede",
        "max-jaderberg"
      ],
      "group_ids": [
        "biozentrum-basel",
        "isomorphic-labs",
        "vantai"
      ],
      "hosted_by": [
        "plinder-initiative"
      ],
      "composite_score": 97.0,
      "experimental_validation": "retrospective",
      "dataset_url": "https://www.plinder.sh/datasets",
      "paper_url": "https://doi.org/10.48550/arXiv.2409.17475"
    },
    {
      "id": "stringdb",
      "name": "STRING",
      "stages": [
        "target-id",
        "disease-modeling"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "ppi",
        "network-inference"
      ],
      "description": "Protein-protein interaction & functional association network across 12k organisms.",
      "size": {
        "proteins": 67000000,
        "associations": 20000000000,
        "organisms": 12535
      },
      "primary_paper": {
        "title": "The STRING database in 2023",
        "authors": [
          "Szklarczyk D",
          "Kirsch R",
          "Koutrouli M",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1093/nar/gkac1000",
        "citations": 6894
      },
      "official_url": "https://string-db.org/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2000",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Workhorse for network-based target ID. Distinguish functional vs physical edges.",
      "related_benchmarks": [
        "open-targets",
        "primekg"
      ],
      "expert_ids": [
        "christian-von-mering"
      ],
      "group_ids": [
        "sib-swiss"
      ],
      "hosted_by": [
        "elixir"
      ],
      "composite_score": 94.9,
      "experimental_validation": "retrospective",
      "dataset_url": "https://string-db.org/cgi/download",
      "paper_url": "https://doi.org/10.1093/nar/gkac1000"
    },
    {
      "id": "casp15",
      "name": "CASP15",
      "stages": [
        "hit-id",
        "target-id"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "structure-prediction"
      ],
      "description": "CASP15 blind structure prediction including multimer and ligand-bound categories.",
      "size": {
        "targets": 127,
        "categories": 5
      },
      "primary_paper": {
        "title": "Critical assessment of methods of protein structure prediction (CASP) \u2014 Round XV",
        "authors": [
          "Kryshtafovych A",
          "Schwede T",
          "Topf M",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1002/prot.26617",
        "citations": 147
      },
      "official_url": "https://predictioncenter.org/casp15/",
      "github_url": "N/A",
      "leaderboard_url": "https://predictioncenter.org/casp15/results",
      "license": "Public",
      "first_release": "2022",
      "last_updated": "2023",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 3,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Biennial. Introduced ligand prediction category.",
      "related_benchmarks": [
        "casp16",
        "cameo-targets"
      ],
      "expert_ids": [
        "andriy-kryshtafovych",
        "john-moult",
        "torsten-schwede"
      ],
      "group_ids": [
        "prediction-center-ucd"
      ],
      "hosted_by": [
        "casp"
      ],
      "composite_score": 94.9,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1002/prot.26617"
    },
    {
      "id": "casp16",
      "name": "CASP16",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein-general",
        "small-molecule"
      ],
      "task_types": [
        "structure-prediction",
        "ligand-pose"
      ],
      "description": "CASP16 (2024) \u2014 expanded multimer, RNA, and ligand prediction.",
      "size": {
        "targets": 140,
        "categories": 6
      },
      "primary_paper": {
        "title": "CASP16 preliminary overview",
        "authors": [
          "CASP16 assessors"
        ],
        "year": 2024,
        "doi": "N/A \u2014 overview papers pending",
        "citations": 15
      },
      "official_url": "https://predictioncenter.org/casp16/",
      "github_url": "N/A",
      "leaderboard_url": "https://predictioncenter.org/casp16/results",
      "license": "Public",
      "first_release": "2024",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "First full multimer+ligand+RNA joint eval.",
      "related_benchmarks": [
        "casp15"
      ],
      "expert_ids": [
        "andriy-kryshtafovych",
        "john-moult"
      ],
      "group_ids": [
        "prediction-center-ucd"
      ],
      "hosted_by": [
        "casp"
      ],
      "composite_score": 94.4,
      "experimental_validation": "retrospective",
      "paper_url": "N/A"
    },
    {
      "id": "cameo-targets",
      "name": "CAMEO weekly targets",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein-general",
        "small-molecule"
      ],
      "task_types": [
        "structure-prediction",
        "ligand-pose"
      ],
      "description": "Continuous weekly blind eval using pre-release PDB structures \u2014 3D, multimer, ligand pocket.",
      "size": {
        "targets_per_year": 1000,
        "categories": 4
      },
      "primary_paper": {
        "title": "CAMEO: continuous evaluation of computational biology methods",
        "authors": [
          "Haas J",
          "Roth S",
          "Arnold K",
          "et al."
        ],
        "year": 2013,
        "doi": "10.1093/database/bat031",
        "citations": 286
      },
      "official_url": "https://www.cameo3d.org/",
      "github_url": "N/A",
      "leaderboard_url": "https://www.cameo3d.org/",
      "license": "CC-BY 4.0",
      "first_release": "2013",
      "last_updated": "2025-05",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Weekly cadence complements biennial CASP.",
      "related_benchmarks": [
        "casp15",
        "casp16"
      ],
      "expert_ids": [
        "torsten-schwede",
        "jurgen-haas"
      ],
      "group_ids": [
        "biozentrum-basel"
      ],
      "hosted_by": [
        "cameo"
      ],
      "composite_score": 94.4,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1093/database/bat031"
    },
    {
      "id": "ord-bench",
      "name": "ORD Reaction Benchmark",
      "stages": [
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "reaction-prediction",
        "yield"
      ],
      "description": "Reaction benchmarks derived from the Open Reaction Database (yield prediction, condition recommendation).",
      "size": {
        "reactions": 2100000,
        "tasks": 4
      },
      "primary_paper": {
        "title": "The Open Reaction Database",
        "authors": [
          "Kearnes SM",
          "Maser MR",
          "Wleklinski M",
          "et al."
        ],
        "year": 2021,
        "doi": "10.1021/jacs.1c09820",
        "citations": 232
      },
      "official_url": "https://open-reaction-database.org/",
      "github_url": "https://github.com/open-reaction-database",
      "leaderboard_url": "N/A",
      "license": "CC-BY-SA 4.0",
      "first_release": "2021-07",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Modern open reaction corpus; industry-scale.",
      "related_benchmarks": [
        "uspto-retrosyn"
      ],
      "expert_ids": [
        "steven-kearnes",
        "abigail-doyle",
        "connor-coley"
      ],
      "group_ids": [
        "google-research",
        "princeton-chemistry",
        "coley-lab"
      ],
      "hosted_by": [
        "open-reaction-database"
      ],
      "composite_score": 93.9,
      "experimental_validation": "retrospective",
      "dataset_url": "https://open-reaction-database.org/client",
      "paper_url": "https://doi.org/10.1021/jacs.1c09820"
    },
    {
      "id": "openproblems-perturbation",
      "name": "Open Problems: Perturbation Prediction",
      "stages": [
        "virtual-cell"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "perturbation-prediction"
      ],
      "description": "NeurIPS competition & continuing benchmark for single-cell perturbation response prediction under distribution shift.",
      "size": {
        "cells": 240000,
        "perturbations": 144
      },
      "primary_paper": {
        "title": "Predicting Cellular Responses to Novel Drug Perturbations at a Single-Cell Resolution",
        "authors": [
          "Hetzel L",
          "Boehm S",
          "Kilbertus N",
          "et al."
        ],
        "year": 2022,
        "doi": "10.48550/arXiv.2204.13545",
        "citations": 180
      },
      "official_url": "https://openproblems.bio/",
      "github_url": "https://github.com/openproblems-bio/openproblems",
      "leaderboard_url": "https://openproblems.bio/results",
      "license": "MIT",
      "first_release": "2021-06",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Best-in-class rigor (Viash workflow, hidden test, NeurIPS track).",
      "related_benchmarks": [
        "cz-virtual-cell-challenge",
        "scperturb",
        "lincs-l1000"
      ],
      "expert_ids": [
        "fabian-theis",
        "daniel-burkhardt",
        "malte-luecken"
      ],
      "group_ids": [
        "openproblems",
        "helmholtz-munich",
        "czi-science"
      ],
      "hosted_by": [
        "openproblems",
        "czi-virtual-cell"
      ],
      "composite_score": 91.9,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.48550/arXiv.2204.13545"
    },
    {
      "id": "primekg",
      "name": "PrimeKG",
      "stages": [
        "disease-modeling",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "knowledge-graph",
        "drug-repurposing"
      ],
      "description": "Precision medicine knowledge graph integrating 20 sources for 17k diseases.",
      "size": {
        "nodes": 129375,
        "edges": 8100498,
        "diseases": 17080
      },
      "primary_paper": {
        "title": "Building a knowledge graph to enable precision medicine",
        "authors": [
          "Chandak P",
          "Huang K",
          "Zitnik M"
        ],
        "year": 2023,
        "doi": "10.1038/s41597-023-01960-3",
        "citations": 515
      },
      "official_url": "https://zitniklab.hms.harvard.edu/projects/PrimeKG/",
      "github_url": "https://github.com/mims-harvard/PrimeKG",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2023-03",
      "last_updated": "2025-02",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Modern, well-engineered KG; strong for GNN drug repurposing.",
      "related_benchmarks": [
        "open-targets",
        "disgenet"
      ],
      "expert_ids": [
        "marinka-zitnik",
        "payal-chandak",
        "kexin-huang"
      ],
      "group_ids": [
        "zitnik-lab"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 91.9,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1038/s41597-023-01960-3"
    },
    {
      "id": "faers-bench",
      "name": "FAERS (raw)",
      "stages": [
        "post-market-rwe"
      ],
      "modalities": [
        "small-molecule",
        "biologic-mab"
      ],
      "task_types": [
        "pharmacovigilance"
      ],
      "description": "FDA Adverse Event Reporting System \u2014 19M+ reports used as substrate for signal-detection ML.",
      "size": {
        "reports": 19000000
      },
      "primary_paper": {
        "title": "An assessment of the U.S. FDA Adverse Event Reporting System (FAERS) and the impact of quality reporting",
        "authors": [
          "Sakaeda T",
          "Tamon A",
          "Kadoyama K",
          "Okuno Y"
        ],
        "year": 2013,
        "doi": "10.3390/ijerph100300796",
        "citations": 780
      },
      "official_url": "https://open.fda.gov/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public domain",
      "first_release": "1969",
      "last_updated": "2025-06",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Known under-/over-reporting biases.",
      "related_benchmarks": [
        "offsides-twosides",
        "sider"
      ],
      "expert_ids": [
        "fda-cder"
      ],
      "group_ids": [
        "fda-cder"
      ],
      "hosted_by": [
        "faers"
      ],
      "composite_score": 91.1,
      "experimental_validation": "retrospective",
      "dataset_url": "https://fis.fda.gov/extensions/FPD-QDE-FAERS/FPD-QDE-FAERS.html",
      "paper_url": "https://doi.org/10.3390/ijerph100300796"
    },
    {
      "id": "longevity-bench-insilico",
      "name": "Longevity Benchmark (Insilico)",
      "stages": [
        "disease-modeling",
        "target-id",
        "post-market-rwe"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "aging-prediction",
        "survival",
        "biomarker"
      ],
      "description": "19 aging/longevity benchmarks on ScienceAIBench/InsilicoBench/DDB \u2014 NHANES mortality, TCGA survival, methylation age, GTEx, Olink proteomic, longevity synergy.",
      "size": {
        "benchmarks": 19,
        "datasets": [
          "NHANES",
          "TCGA",
          "Methylation",
          "GTEx",
          "Olink"
        ]
      },
      "primary_paper": {
        "title": "Longevity Benchmark methodology",
        "authors": [
          "Insilico AI Team"
        ],
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 12
      },
      "official_url": "https://insilicobench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://insilicobench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-02",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Unique, broad longevity/aging benchmark slice \u2014 nothing else in the field covers aging comparably. Leaderboard features frontier LLMs.",
      "related_benchmarks": [
        "depmap",
        "mimic-benchmark"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper",
        "alex-zhebrak"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-insilicobench",
        "insilico-ddb"
      ],
      "composite_score": 90.6,
      "experimental_validation": "prospective",
      "dataset_url": "https://insilico.com/mmai",
      "paper_url": "https://doi.org/N/A \u2014 portal"
    },
    {
      "id": "lincs-l1000",
      "name": "LINCS L1000 / CMap",
      "stages": [
        "virtual-cell",
        "disease-modeling",
        "target-id"
      ],
      "modalities": [
        "small-molecule",
        "cross-modality"
      ],
      "task_types": [
        "signature-match",
        "mechanism-of-action"
      ],
      "description": "1.3M transcriptional profiles across 978 landmark genes after genetic or chemical perturbations.",
      "size": {
        "profiles": 1319138,
        "compounds": 33000,
        "cell_lines": 82
      },
      "primary_paper": {
        "title": "A Next Generation Connectivity Map: L1000 Platform and the First 1,000,000 Profiles",
        "authors": [
          "Subramanian A",
          "Narayan R",
          "Corsello SM",
          "et al."
        ],
        "year": 2017,
        "doi": "10.1016/j.cell.2017.10.049",
        "citations": 3700
      },
      "official_url": "https://clue.io/",
      "github_url": "https://github.com/cmap",
      "leaderboard_url": "N/A",
      "license": "CC-BY",
      "first_release": "2017-11",
      "last_updated": "2024-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 3,
        "adoption": 5,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Foundational pharma resource for MoA work. Batch effects require careful handling.",
      "related_benchmarks": [
        "scperturb",
        "depmap"
      ],
      "expert_ids": [
        "aravind-subramanian",
        "todd-golub"
      ],
      "group_ids": [
        "broad-cmap"
      ],
      "hosted_by": [],
      "composite_score": 89.9,
      "experimental_validation": "wet-lab-confirmed",
      "dataset_url": "https://clue.io/data",
      "paper_url": "https://doi.org/10.1016/j.cell.2017.10.049"
    },
    {
      "id": "cansar",
      "name": "canSAR",
      "stages": [
        "target-id",
        "hit-id"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "druggability",
        "target-annotation"
      ],
      "description": "Cancer translational knowledge base integrating pharmacology, bioactivity, structure, druggability.",
      "size": {
        "proteins": 25000,
        "compounds": 11000000,
        "bioactivity_datapoints": 106000000
      },
      "primary_paper": {
        "title": "canSAR: update to the cancer translational research and drug discovery knowledgebase",
        "authors": [
          "Mitsopoulos C",
          "Di Micco P",
          "et al."
        ],
        "year": 2021,
        "doi": "10.1093/nar/gkaa1059",
        "citations": 210
      },
      "official_url": "https://cansar.ai/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Academic free / commercial tier",
      "first_release": "2011",
      "last_updated": "2024-09",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Deep oncology focus; widely-used druggability predictor.",
      "related_benchmarks": [
        "open-targets",
        "depmap",
        "chembl"
      ],
      "expert_ids": [
        "bissan-al-lazikani"
      ],
      "group_ids": [
        "icr-london"
      ],
      "hosted_by": [],
      "composite_score": 89.4,
      "experimental_validation": "wet-lab-confirmed",
      "dataset_url": "https://cansar.ai/",
      "paper_url": "https://doi.org/10.1093/nar/gkaa1059"
    },
    {
      "id": "mimic-benchmark",
      "name": "MIMIC-IV Benchmark Tasks",
      "stages": [
        "phase-iii",
        "clinical-development",
        "post-market-rwe"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "clinical-outcome"
      ],
      "description": "Standardized ICU benchmarks on MIMIC-IV \u2014 mortality, LOS, sepsis, AKI, drug dosing.",
      "size": {
        "patients": 299712,
        "tasks": 14
      },
      "primary_paper": {
        "title": "MIMIC-IV, a freely accessible electronic health record dataset",
        "authors": [
          "Johnson AEW",
          "Bulgarelli L",
          "Shen L",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1038/s41597-022-01899-x",
        "citations": 2832
      },
      "official_url": "https://physionet.org/content/mimiciv/",
      "github_url": "https://github.com/MIT-LCP/mimic-code",
      "leaderboard_url": "N/A",
      "license": "PhysioNet Credentialed",
      "first_release": "2020",
      "last_updated": "2025-02",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Canonical clinical ML benchmark. Credentialed access limits casual use.",
      "related_benchmarks": [
        "ctod"
      ],
      "expert_ids": [
        "alistair-johnson",
        "leo-anthony-celi",
        "roger-mark"
      ],
      "group_ids": [
        "mit-lcp"
      ],
      "hosted_by": [
        "mimic"
      ],
      "composite_score": 89.4,
      "experimental_validation": "clinical",
      "dataset_url": "https://physionet.org/content/mimiciv/",
      "paper_url": "https://doi.org/10.1038/s41597-022-01899-x"
    },
    {
      "id": "scperturb",
      "name": "scPerturb",
      "stages": [
        "virtual-cell",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "perturbation-prediction"
      ],
      "description": "Harmonized single-cell perturbation datasets (genetic + chemical) with standardized metadata.",
      "size": {
        "cells": 1700000,
        "datasets": 44
      },
      "primary_paper": {
        "title": "scPerturb: Harmonized single-cell perturbation data",
        "authors": [
          "Peidli S",
          "Green TD",
          "Shen C",
          "Theis FJ",
          "et al."
        ],
        "year": 2024,
        "doi": "10.1038/s41592-023-02144-y",
        "citations": 280
      },
      "official_url": "http://projects.sanderlab.org/scperturb/",
      "github_url": "https://github.com/sanderlab/scPerturb",
      "leaderboard_url": "N/A \u2014 dataset resource",
      "license": "MIT",
      "first_release": "2023-05",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Canonical harmonized resource. Strong Perturb-seq coverage; weaker for chemical perturbations.",
      "related_benchmarks": [
        "cz-virtual-cell-challenge",
        "openproblems-perturbation",
        "lincs-l1000"
      ],
      "expert_ids": [
        "chris-sander",
        "fabian-theis"
      ],
      "group_ids": [
        "sander-lab",
        "helmholtz-munich"
      ],
      "hosted_by": [],
      "composite_score": 88.9,
      "experimental_validation": "retrospective",
      "dataset_url": "http://projects.sanderlab.org/scperturb/",
      "paper_url": "https://doi.org/10.1038/s41592-023-02144-y"
    },
    {
      "id": "pinder",
      "name": "PINDER",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "protein-protein-docking"
      ],
      "description": "Protein-protein docking benchmark with rigorous split design.",
      "size": {
        "dimers": 2319564,
        "systems": 267498
      },
      "primary_paper": {
        "title": "PINDER: The Protein Interaction Dataset and Evaluation Resource",
        "authors": [
          "Kovtun D",
          "Akdel M",
          "Goncearenco A",
          "et al."
        ],
        "year": 2024,
        "doi": "10.1101/2024.07.17.603980",
        "citations": 45
      },
      "official_url": "https://www.pinder.sh/",
      "github_url": "https://github.com/pinder-org/pinder",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2024-07",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 3,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Expected PPI docking standard.",
      "related_benchmarks": [
        "plinder",
        "capri-benchmark"
      ],
      "expert_ids": [
        "torsten-schwede"
      ],
      "group_ids": [
        "vantai",
        "biozentrum-basel"
      ],
      "hosted_by": [
        "plinder-initiative"
      ],
      "composite_score": 88.9,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1101/2024.07.17.603980"
    },
    {
      "id": "pmo",
      "name": "Practical Molecular Optimization (PMO)",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "molecule-generation",
        "oracle"
      ],
      "description": "Budget-aware benchmark for goal-directed molecule optimization with 23 oracle functions.",
      "size": {
        "oracles": 23,
        "budget": 10000
      },
      "primary_paper": {
        "title": "Sample Efficiency Matters: A Benchmark for Practical Molecular Optimization",
        "authors": [
          "Gao W",
          "Fu T",
          "Sun J",
          "Coley CW"
        ],
        "year": 2022,
        "doi": "10.48550/arXiv.2206.12411",
        "citations": 260
      },
      "official_url": "https://github.com/wenhao-gao/mol_opt",
      "github_url": "https://github.com/wenhao-gao/mol_opt",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022-06",
      "last_updated": "2024-10",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Sample-efficiency focus exposed shortcomings of reward-maxing methods.",
      "related_benchmarks": [
        "guacamol",
        "moses",
        "tdc-admet"
      ],
      "expert_ids": [
        "wenhao-gao",
        "tianfan-fu",
        "connor-coley"
      ],
      "group_ids": [
        "mit-csail",
        "coley-lab"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 88.9,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.48550/arXiv.2206.12411"
    },
    {
      "id": "cov-abdab",
      "name": "CoV-AbDab",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "antibody-antigen-binding"
      ],
      "description": "Coronavirus antibody database \u2014 binding & neutralization annotations for SARS-CoV-2/MERS/etc.",
      "size": {
        "antibodies": 12000
      },
      "primary_paper": {
        "title": "CoV-AbDab: the coronavirus antibody database",
        "authors": [
          "Raybould MIJ",
          "Kovaltsuk A",
          "Marks C",
          "Deane CM"
        ],
        "year": 2021,
        "doi": "10.1093/bioinformatics/btaa739",
        "citations": 220
      },
      "official_url": "https://opig.stats.ox.ac.uk/webapps/covabdab/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2020",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Narrow modality but critical for pandemic-preparedness ML.",
      "related_benchmarks": [
        "sabdab",
        "oas"
      ],
      "expert_ids": [
        "matthew-raybould",
        "charlotte-deane"
      ],
      "group_ids": [
        "oxford-opig"
      ],
      "hosted_by": [],
      "composite_score": 88.9,
      "experimental_validation": "retrospective",
      "dataset_url": "http://opig.stats.ox.ac.uk/webapps/covabdab/",
      "paper_url": "https://doi.org/10.1093/bioinformatics/btaa739"
    },
    {
      "id": "pubchem-bioassay",
      "name": "PubChem BioAssay",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "bioactivity"
      ],
      "description": "NIH PubChem's screening assay repository.",
      "size": {
        "assays": 1700000,
        "compounds": 114000000
      },
      "primary_paper": {
        "title": "PubChem 2023 update",
        "authors": [
          "Kim S",
          "Chen J",
          "Cheng T",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1093/nar/gkac956",
        "citations": 2756
      },
      "official_url": "https://pubchem.ncbi.nlm.nih.gov/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public domain",
      "first_release": "2004",
      "last_updated": "2025-05",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Broadest HTS repository; quality heterogeneous.",
      "related_benchmarks": [
        "chembl",
        "moleculenet"
      ],
      "expert_ids": [
        "sunghwan-kim"
      ],
      "group_ids": [
        "nih-ncbi"
      ],
      "hosted_by": [],
      "composite_score": 88.6,
      "experimental_validation": "wet-lab-confirmed",
      "dataset_url": "https://pubchem.ncbi.nlm.nih.gov/bioassay/",
      "paper_url": "https://doi.org/10.1093/nar/gkac956"
    },
    {
      "id": "polaris-admet",
      "name": "Polaris ADMET",
      "stages": [
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "classification"
      ],
      "description": "Industry-contributed ADMET benchmarks on Polaris Hub (Novartis/AstraZeneca/Polaris SC curated endpoints).",
      "size": {
        "endpoints": 12,
        "molecules": 50000
      },
      "primary_paper": {
        "title": "Polaris: method comparison in drug discovery",
        "authors": [
          "Wognum C",
          "Noutahi E",
          "Hsu J",
          "et al."
        ],
        "year": 2024,
        "doi": "N/A \u2014 working paper",
        "citations": 30
      },
      "official_url": "https://polarishub.io/benchmarks",
      "github_url": "https://github.com/polaris-hub/polaris",
      "leaderboard_url": "https://polarishub.io/benchmarks",
      "license": "Polaris Community",
      "first_release": "2023-10",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 3,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Industry splits enforce blinded eval; highest industry relevance among ADMET benchmarks.",
      "related_benchmarks": [
        "tdc-admet",
        "moleculenet"
      ],
      "expert_ids": [
        "cas-wognum",
        "emmanuel-noutahi",
        "jonathan-hsu"
      ],
      "group_ids": [
        "valence-labs",
        "recursion",
        "novartis-nibr",
        "astrazeneca"
      ],
      "hosted_by": [
        "polaris"
      ],
      "composite_score": 88.4,
      "experimental_validation": "prospective",
      "huggingface_url": "https://huggingface.co/valence-labs",
      "paper_url": "https://doi.org/N/A \u2014 working paper"
    },
    {
      "id": "cz-virtual-cell-challenge",
      "name": "CZ Virtual Cell Challenge",
      "stages": [
        "virtual-cell",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "perturbation-prediction"
      ],
      "description": "Open challenge to predict transcriptomic responses to genetic & chemical perturbations across cell types, hosted by CZ Biohub.",
      "size": {
        "cells": 2000000,
        "perturbations": 300
      },
      "primary_paper": {
        "title": "CZ Virtual Cell Challenge (launch)",
        "authors": [
          "CZI Science Team"
        ],
        "year": 2024,
        "doi": "N/A \u2014 consortium launch",
        "citations": 40
      },
      "official_url": "https://virtualcellchallenge.org/",
      "github_url": "https://github.com/czbiohub-sf",
      "leaderboard_url": "https://virtualcellchallenge.org/leaderboard",
      "license": "CC-BY",
      "first_release": "2024-11",
      "last_updated": "2025-09",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Gold standard-in-the-making for foundation-model era perturbation prediction. Hidden test \u2192 strong against leakage.",
      "related_benchmarks": [
        "scperturb",
        "openproblems-perturbation",
        "perturbbench"
      ],
      "expert_ids": [
        "stephen-quake",
        "ambrose-carr"
      ],
      "group_ids": [
        "cz-biohub",
        "czi-science"
      ],
      "hosted_by": [
        "czi-virtual-cell"
      ],
      "composite_score": 88.1,
      "experimental_validation": "prospective",
      "paper_url": "https://doi.org/N/A \u2014 consortium launch"
    },
    {
      "id": "pli-gpcr-suite",
      "name": "ISM Benchmarks: GPCRs (Insilico)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "binding-affinity"
      ],
      "description": "87-benchmark GPCR affinity suite on ScienceAIBench / InsilicoBench / DDB \u2014 kinome-scale coverage of class A/B/C GPCRs.",
      "size": {
        "benchmarks": 87,
        "gpcr_targets": 87
      },
      "primary_paper": {
        "title": "GPCR affinity benchmark methodology",
        "authors": "Insilico AI Team",
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 8
      },
      "official_url": "https://scienceaibench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://scienceaibench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-02",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Largest open GPCR affinity benchmark. Leaderboards test external frontier LLMs \u2014 not self-referential.",
      "related_benchmarks": [
        "pdbbind",
        "chembl"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper",
        "alex-zhebrak"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-insilicobench",
        "insilico-ddb"
      ],
      "composite_score": 87.6,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/N/A \u2014 portal"
    },
    {
      "id": "capri-benchmark",
      "name": "CAPRI Rounds",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "protein-protein-docking"
      ],
      "description": "56 rounds of blind protein-protein (and peptide, ligand) complex prediction.",
      "size": {
        "rounds": 56,
        "targets": 300
      },
      "primary_paper": {
        "title": "Modeling protein-protein and protein-peptide complexes: CAPRI 6th edition",
        "authors": [
          "Lensink MF",
          "Nadzirin N",
          "Velankar S",
          "Wodak SJ"
        ],
        "year": 2020,
        "doi": "10.1002/prot.25870",
        "citations": 200
      },
      "official_url": "https://www.ebi.ac.uk/pdbe/complex-pred/capri/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public",
      "first_release": "2001",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Oldest PPI prediction benchmark.",
      "related_benchmarks": [
        "pinder",
        "casp16"
      ],
      "expert_ids": [
        "marc-lensink",
        "shoshana-wodak"
      ],
      "group_ids": [
        "ebi",
        "ibm-brussels"
      ],
      "hosted_by": [
        "capri"
      ],
      "composite_score": 86.3,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1002/prot.25870"
    },
    {
      "id": "toxcast",
      "name": "ToxCast",
      "stages": [
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity"
      ],
      "description": "EPA's in vitro toxicity screening dataset \u2014 ~700 assays \u00d7 ~9k chemicals.",
      "size": {
        "assays": 700,
        "compounds": 9000
      },
      "primary_paper": {
        "title": "The ToxCast Program for Prioritizing Toxicity Testing of Environmental Chemicals",
        "authors": [
          "Dix DJ",
          "Houck KA",
          "Martin MT",
          "et al."
        ],
        "year": 2007,
        "doi": "10.1093/toxsci/kfm297",
        "citations": 1900
      },
      "official_url": "https://www.epa.gov/comptox-tools/exploring-toxcast-data",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public",
      "first_release": "2007",
      "last_updated": "2024-06",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Regulatory-grade broad tox dataset.",
      "related_benchmarks": [
        "tox21"
      ],
      "expert_ids": [
        "richard-judson"
      ],
      "group_ids": [
        "epa-ccte"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 85.6,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1093/toxsci/kfm297"
    },
    {
      "id": "targetbench-insilico",
      "name": "TargetBench (Insilico)",
      "stages": [
        "target-id",
        "disease-modeling"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "target-prioritization"
      ],
      "description": "10 disease-area target identification benchmarks on ScienceAIBench/DDB (cancer, cardiovascular, endocrine/metabolic, fibrotic, inflammation/immunology, neuro, etc.).",
      "size": {
        "benchmarks": 10,
        "disease_areas": 10
      },
      "primary_paper": {
        "title": "TargetBench methodology",
        "authors": "Insilico AI Team",
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 5
      },
      "official_url": "https://scienceaibench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://scienceaibench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-03",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Disease-organized target ID benchmark \u2014 unique axis. Frontier LLM leaderboard.",
      "related_benchmarks": [
        "open-targets",
        "depmap"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 84.6,
      "experimental_validation": "wet-lab-confirmed",
      "paper_url": "https://doi.org/10.1038/s44335-025-00039-1"
    },
    {
      "id": "ism-admet",
      "name": "ISM Benchmarks: ADMET (Insilico)",
      "stages": [
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "admet-regression",
        "admet-classification"
      ],
      "description": "28-endpoint ADMET benchmark suite on ScienceAIBench/InsilicoBench/DDB.",
      "size": {
        "benchmarks": 28
      },
      "primary_paper": {
        "title": "ISM ADMET benchmark methodology",
        "authors": "Insilico AI Team",
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 7
      },
      "official_url": "https://scienceaibench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://scienceaibench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-02",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Broader endpoint coverage than TDC ADMET. Side-by-side with TDC mirror on DDB.",
      "related_benchmarks": [
        "tdc-admet",
        "admet-ai"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-insilicobench",
        "insilico-ddb"
      ],
      "composite_score": 84.6,
      "experimental_validation": "wet-lab-confirmed",
      "paper_url": "https://doi.org/N/A \u2014 portal"
    },
    {
      "id": "cafa5",
      "name": "CAFA5",
      "stages": [
        "target-id"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "function-annotation"
      ],
      "description": "CAFA 5th edition \u2014 blind GO annotation eval (142k targets; Kaggle 2023, 1625 teams).",
      "size": {
        "targets": 142000,
        "go_terms": 43000
      },
      "primary_paper": {
        "title": "The CAFA challenge reports improved protein function prediction and new functional annotations for hundreds of genes through experimental screens",
        "authors": [
          "Zhou N",
          "Jiang Y",
          "Bergquist TR",
          "et al."
        ],
        "year": 2019,
        "doi": "10.1186/s13059-019-1835-8",
        "citations": 419
      },
      "official_url": "https://biofunctionprediction.org/cafa/",
      "github_url": "N/A",
      "leaderboard_url": "https://biofunctionprediction.org/cafa/results",
      "license": "Public",
      "first_release": "2022",
      "last_updated": "2023-12",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "CAFA5 broke attendance records.",
      "related_benchmarks": [
        "proteingym"
      ],
      "expert_ids": [
        "predrag-radivojac",
        "iddo-friedberg"
      ],
      "group_ids": [
        "cafa-consortium"
      ],
      "hosted_by": [
        "cafa"
      ],
      "composite_score": 84.3,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1186/s13059-019-1835-8"
    },
    {
      "id": "molecule-ace",
      "name": "MoleculeACE",
      "stages": [
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "activity-cliff"
      ],
      "description": "Benchmark testing model robustness on activity cliffs across 30 ChEMBL targets.",
      "size": {
        "targets": 30,
        "molecules": 48000
      },
      "primary_paper": {
        "title": "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs",
        "authors": [
          "van Tilborg D",
          "Alenicheva A",
          "Grisoni F"
        ],
        "year": 2022,
        "doi": "10.1021/acs.jcim.2c01073",
        "citations": 233
      },
      "official_url": "https://github.com/molML/MoleculeACE",
      "github_url": "https://github.com/molML/MoleculeACE",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022-11",
      "last_updated": "2024-05",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Critical stress-test for generalization; exposed GNN weaknesses.",
      "related_benchmarks": [
        "tdc-admet",
        "moleculenet"
      ],
      "expert_ids": [
        "francesca-grisoni",
        "derek-van-tilborg"
      ],
      "group_ids": [
        "tue-eindhoven"
      ],
      "hosted_by": [],
      "composite_score": 83.3,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1021/acs.jcim.2c01073"
    },
    {
      "id": "matbench",
      "name": "MatBench",
      "stages": [
        "developmental-candidate"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "materials-property"
      ],
      "description": "Materials property prediction benchmark suite (used for some chemistry-adjacent ML).",
      "size": {
        "tasks": 13,
        "samples": 132000
      },
      "primary_paper": {
        "title": "Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm",
        "authors": [
          "Dunn A",
          "Wang Q",
          "Ganose A",
          "Dopp D",
          "Jain A"
        ],
        "year": 2020,
        "doi": "10.1038/s41524-020-00406-3",
        "citations": 434
      },
      "official_url": "https://matbench.materialsproject.org/",
      "github_url": "https://github.com/materialsproject/matbench",
      "leaderboard_url": "https://matbench.materialsproject.org/Leaderboards%20Per-Task/",
      "license": "MIT",
      "first_release": "2020",
      "last_updated": "2025-02",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Materials-science benchmark; relevant for formulation / co-crystal work.",
      "related_benchmarks": [],
      "expert_ids": [
        "anubhav-jain",
        "alex-dunn"
      ],
      "group_ids": [
        "materials-project",
        "lbl"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 83.3,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1038/s41524-020-00406-3"
    },
    {
      "id": "offsides-twosides",
      "name": "OffSides / TWOSIDES",
      "stages": [
        "post-market-rwe"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "adverse-event",
        "ddi"
      ],
      "description": "Statistically corrected single- and pair-wise drug adverse events derived from FAERS.",
      "size": {
        "offsides_signals": 438801,
        "twosides_combos": 870000
      },
      "primary_paper": {
        "title": "Data-driven prediction of drug effects and interactions",
        "authors": [
          "Tatonetti NP",
          "Ye PP",
          "Daneshjou R",
          "Altman RB"
        ],
        "year": 2012,
        "doi": "10.1126/scitranslmed.3003377",
        "citations": 1100
      },
      "official_url": "http://tatonettilab.org/offsides/",
      "github_url": "https://github.com/tatonetti-lab/nsides-release",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2012",
      "last_updated": "2023-09",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Key benchmark for DDI + adverse event ML.",
      "related_benchmarks": [
        "sider",
        "faers-bench"
      ],
      "expert_ids": [
        "nick-tatonetti",
        "russ-altman"
      ],
      "group_ids": [
        "tatonetti-lab"
      ],
      "hosted_by": [
        "faers",
        "tdc"
      ],
      "composite_score": 83.0,
      "experimental_validation": "retrospective",
      "dataset_url": "https://tatonettilab.org/resources/tatonetti-stm.html",
      "paper_url": "https://doi.org/10.1126/scitranslmed.3003377"
    },
    {
      "id": "clinbench-quarterly",
      "name": "ClinBench Quarterly (Insilico)",
      "stages": [
        "phase-ii",
        "phase-iii",
        "clinical-development"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "trial-outcome-prediction"
      ],
      "description": "Quarterly-refreshed clinical-trial outcome benchmark on ScienceAIBench / InsilicoBench / DDB (25 tasks).",
      "size": {
        "tasks": 25,
        "refresh_cadence_months": 3
      },
      "primary_paper": {
        "title": "ClinBench methodology note",
        "authors": "Insilico AI Team",
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 10
      },
      "official_url": "https://scienceaibench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://scienceaibench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-01",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Benchmark refresh cadence beats all academic trial outcome benchmarks. Leaderboards test frontier LLMs against quarterly-updated splits.",
      "related_benchmarks": [
        "hint-trialbench",
        "top-benchmark"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-insilicobench",
        "insilico-ddb"
      ],
      "composite_score": 81.5,
      "experimental_validation": "clinical",
      "paper_url": "https://doi.org/N/A \u2014 portal"
    },
    {
      "id": "dockstring",
      "name": "DOCKSTRING",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "docking"
      ],
      "description": "260k ligands docked to 58 targets with AutoDock Vina; reproducible docking benchmark.",
      "size": {
        "ligands": 260155,
        "targets": 58
      },
      "primary_paper": {
        "title": "DOCKSTRING: easy molecular docking yields better benchmarks for ligand design",
        "authors": [
          "Garc\u00eda-Orteg\u00f3n M",
          "Simm GNC",
          "Tripp AJ",
          "et al."
        ],
        "year": 2022,
        "doi": "10.1021/acs.jcim.1c01334",
        "citations": 240
      },
      "official_url": "https://dockstring.github.io/",
      "github_url": "https://github.com/dockstring/dockstring",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022-02",
      "last_updated": "2024-07",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Vina scores are a proxy; not a replacement for wet assays.",
      "related_benchmarks": [
        "lit-pcba",
        "pdbbind",
        "casf-2016"
      ],
      "expert_ids": [
        "jose-miguel-hernandez-lobato"
      ],
      "group_ids": [
        "cambridge-ml"
      ],
      "hosted_by": [],
      "composite_score": 81.3,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1021/acs.jcim.1c01334"
    },
    {
      "id": "disgenet",
      "name": "DisGeNET",
      "stages": [
        "disease-modeling",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "gene-disease-association"
      ],
      "description": "Large collection of gene-disease associations (text-mining + curation + repositories).",
      "size": {
        "associations": 1134942,
        "genes": 21671,
        "diseases": 30170
      },
      "primary_paper": {
        "title": "The DisGeNET knowledge platform for disease genomics: 2019 update",
        "authors": [
          "Pi\u00f1ero J",
          "Ram\u00edrez-Anguita JM",
          "Sa\u00fcch-Pitarch J",
          "et al."
        ],
        "year": 2020,
        "doi": "10.1093/nar/gkz1021",
        "citations": 2480
      },
      "official_url": "https://www.disgenet.com/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY-NC / commercial tier",
      "first_release": "2010",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 5,
        "quality": 3,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [
        "license-gated-commercial"
      ],
      "notes": "Commercial license required for industry. Text-mining noise limits quality.",
      "related_benchmarks": [
        "open-targets",
        "primekg"
      ],
      "expert_ids": [
        "laura-furlong"
      ],
      "group_ids": [
        "medbioinformatics"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 81.0,
      "experimental_validation": "retrospective",
      "dataset_url": "https://www.disgenet.org/downloads",
      "paper_url": "https://doi.org/10.1093/nar/gkz1021"
    },
    {
      "id": "lit-pcba",
      "name": "LIT-PCBA",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "virtual-screening"
      ],
      "description": "Literature-curated dose-response PubChem BioAssay set; 15 targets with unbiased actives/inactives.",
      "size": {
        "targets": 15,
        "actives": 7844,
        "inactives": 2533936
      },
      "primary_paper": {
        "title": "LIT-PCBA: An Unbiased Data Set for Machine Learning and Virtual Screening",
        "authors": [
          "Tran-Nguyen VK",
          "Jacquemard C",
          "Rognan D"
        ],
        "year": 2020,
        "doi": "10.1021/acs.jcim.0c00155",
        "citations": 310
      },
      "official_url": "https://drugdesign.unistra.fr/LIT-PCBA/",
      "github_url": "https://github.com/ViktorTran-Nguyen/LIT-PCBA",
      "leaderboard_url": "N/A",
      "license": "CC-BY",
      "first_release": "2020-03",
      "last_updated": "2023-06",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Much fairer than DUD-E; small target count limits coverage.",
      "related_benchmarks": [
        "dude",
        "dekois",
        "dockstring"
      ],
      "expert_ids": [
        "didier-rognan"
      ],
      "group_ids": [
        "strasbourg-chemo"
      ],
      "hosted_by": [],
      "composite_score": 80.8,
      "experimental_validation": "retrospective",
      "dataset_url": "http://drugdesign.unistra.fr/LIT-PCBA/",
      "paper_url": "https://doi.org/10.1021/acs.jcim.0c00155"
    },
    {
      "id": "flip",
      "name": "FLIP",
      "stages": [
        "target-id",
        "developmental-candidate"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "fitness-prediction"
      ],
      "description": "Fitness landscape inference benchmarks with realistic train/test splits (AAV, GB1, Meltome, SCL, Bind).",
      "size": {
        "landscapes": 5,
        "splits": 15
      },
      "primary_paper": {
        "title": "FLIP: Benchmark tasks in fitness landscape inference for proteins",
        "authors": [
          "Dallago C",
          "Mou J",
          "Johnston KE",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2112.06661",
        "citations": 120
      },
      "official_url": "https://benchmark.protein.properties/",
      "github_url": "https://github.com/J-SNACKKB/FLIP",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2021-12",
      "last_updated": "2024-05",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Complements ProteinGym (smaller but carefully designed splits).",
      "related_benchmarks": [
        "proteingym"
      ],
      "expert_ids": [
        "burkhard-rost",
        "mohammed-alquraishi",
        "christian-dallago"
      ],
      "group_ids": [
        "rostlab-tum",
        "alquraishi-lab"
      ],
      "hosted_by": [
        "flip"
      ],
      "composite_score": 80.8,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.48550/arXiv.2112.06661"
    },
    {
      "id": "cptac-proteogenomic",
      "name": "CPTAC Proteogenomic Benchmarks",
      "stages": [
        "disease-modeling",
        "target-id",
        "phase-ii"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "biomarker-discovery"
      ],
      "description": "Proteogenomic benchmarks across 10 tumor types \u2014 drives DREAM proteogenomic challenges.",
      "size": {
        "samples": 1600,
        "tumor_types": 10,
        "omics_layers": 6
      },
      "primary_paper": {
        "title": "Proteogenomic Characterization of Cancer Types (CPTAC overview)",
        "authors": [
          "Zhang B",
          "Wang J",
          "Wang X",
          "et al."
        ],
        "year": 2014,
        "doi": "10.1038/nature13438",
        "citations": 1326
      },
      "official_url": "https://proteomics.cancer.gov/programs/cptac",
      "github_url": "https://github.com/PayneLab/cptac",
      "leaderboard_url": "N/A",
      "license": "dbGaP / public",
      "first_release": "2011",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Deep integrative oncology data.",
      "related_benchmarks": [
        "depmap"
      ],
      "expert_ids": [
        "henry-rodriguez",
        "amanda-paulovich",
        "bing-zhang"
      ],
      "group_ids": [
        "nci-cptac"
      ],
      "hosted_by": [
        "cptac"
      ],
      "composite_score": 80.8,
      "experimental_validation": "clinical",
      "dataset_url": "https://proteomic.datacommons.cancer.gov/pdc/",
      "paper_url": "https://doi.org/10.1038/nature13438"
    },
    {
      "id": "guacamol",
      "name": "GuacaMol",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "molecule-generation"
      ],
      "description": "Goal-directed + distribution-learning benchmarks for molecular generative models.",
      "size": {
        "tasks": 20,
        "train_set": 1600000
      },
      "primary_paper": {
        "title": "GuacaMol: Benchmarking Models for de Novo Molecular Design",
        "authors": [
          "Brown N",
          "Fiscato M",
          "Segler MHS",
          "Vaucher AC"
        ],
        "year": 2019,
        "doi": "10.1021/acs.jcim.8b00839",
        "citations": 910
      },
      "official_url": "https://www.benevolent.com/guacamol",
      "github_url": "https://github.com/BenevolentAI/guacamol",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2019-03",
      "last_updated": "2022-07",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 2,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "First-generation generative benchmark; largely superseded by PMO for goal-directed.",
      "related_benchmarks": [
        "moses",
        "pmo"
      ],
      "expert_ids": [
        "marwin-segler",
        "nathan-brown"
      ],
      "group_ids": [
        "benevolent-ai"
      ],
      "hosted_by": [
        "moleculenet",
        "tdc"
      ],
      "composite_score": 80.5,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1021/acs.jcim.8b00839"
    },
    {
      "id": "pksim",
      "name": "Open Systems Pharmacology / PK-Sim",
      "stages": [
        "phase-i",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule",
        "biologic-mab"
      ],
      "task_types": [
        "pbpk-validation"
      ],
      "description": "OSP Suite \u2014 open PBPK/QSP models and validation sets.",
      "size": {
        "models": 100,
        "validated_compounds": 50
      },
      "primary_paper": {
        "title": "The Open Systems Pharmacology Suite: a new era in PBPK modeling",
        "authors": [
          "Lippert J",
          "Burghaus R",
          "Edginton A",
          "et al."
        ],
        "year": 2019,
        "doi": "10.1002/psp4.12386",
        "citations": 180
      },
      "official_url": "https://www.open-systems-pharmacology.org/",
      "github_url": "https://github.com/Open-Systems-Pharmacology",
      "leaderboard_url": "N/A",
      "license": "GPL-2.0",
      "first_release": "2017",
      "last_updated": "2025-01",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Open alternative to Simcyp.",
      "related_benchmarks": [
        "simcyp-validation"
      ],
      "expert_ids": [
        "andrea-edginton",
        "joerg-lippert"
      ],
      "group_ids": [
        "osp-consortium"
      ],
      "hosted_by": [],
      "composite_score": 80.3,
      "experimental_validation": "retrospective",
      "dataset_url": "https://www.open-systems-pharmacology.org/",
      "paper_url": "https://doi.org/10.1002/psp4.12386"
    },
    {
      "id": "admet-ai",
      "name": "ADMET-AI",
      "stages": [
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "classification"
      ],
      "description": "Graph-based ADMET prediction + benchmark on 41 endpoints leveraging ChEMBL + TDC.",
      "size": {
        "endpoints": 41,
        "molecules": 90000
      },
      "primary_paper": {
        "title": "ADMET-AI: a machine learning ADMET platform for evaluation of large-scale chemical libraries",
        "authors": [
          "Swanson K",
          "Walther P",
          "Leitz J",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1093/bioinformatics/btae416",
        "citations": 85
      },
      "official_url": "https://admet.ai.greenstonebio.com/",
      "github_url": "https://github.com/swansonk14/admet_ai",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2023-11",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Strong baselines + web tool; builds on TDC.",
      "related_benchmarks": [
        "tdc-admet",
        "moleculenet"
      ],
      "expert_ids": [
        "kyle-swanson",
        "regina-barzilay"
      ],
      "group_ids": [
        "mit-csail",
        "barzilay-lab"
      ],
      "hosted_by": [],
      "composite_score": 79.5,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1093/bioinformatics/btae416"
    },
    {
      "id": "ames",
      "name": "AMES (mutagenicity)",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-classification"
      ],
      "description": "AMES bacterial mutagenicity benchmark \u2014 standard gentox endpoint.",
      "size": {
        "molecules": 7278
      },
      "primary_paper": {
        "title": "Therapeutics Data Commons",
        "authors": [
          "Huang K",
          "Fu T",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2102.09548",
        "citations": 620
      },
      "official_url": "https://tdcommons.ai/single_pred_tasks/tox/#ames",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/22ames/",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2025-01",
      "rubric": {
        "rigor": 4,
        "coverage": 2,
        "maintenance": 4,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Core gentox endpoint.",
      "related_benchmarks": [
        "tdc-admet",
        "tox21"
      ],
      "expert_ids": [
        "kexin-huang"
      ],
      "group_ids": [
        "tdc"
      ],
      "hosted_by": [
        "tdc",
        "insilico-ddb"
      ],
      "composite_score": 79.5,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.48550/arXiv.2102.09548"
    },
    {
      "id": "polaris-biologics",
      "name": "Polaris Biologics (Polyreactivity / SEC / Tm)",
      "stages": [
        "developmental-candidate"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "developability"
      ],
      "description": "Polaris Hub biologics benchmarks: polyreactivity, SEC-SMAC, Tm + titer.",
      "size": {
        "tasks": 6,
        "antibodies": 2000
      },
      "primary_paper": {
        "title": "Polaris biologics method comparison",
        "authors": "Wognum C et al.",
        "year": 2024,
        "doi": "N/A",
        "citations": 15
      },
      "official_url": "https://polarishub.io/benchmarks",
      "github_url": "https://github.com/polaris-hub/polaris",
      "leaderboard_url": "https://polarishub.io/benchmarks",
      "license": "Polaris Community",
      "first_release": "2024-04",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Industry-donated; growing.",
      "related_benchmarks": [
        "polaris-admet"
      ],
      "expert_ids": [
        "cas-wognum"
      ],
      "group_ids": [
        "valence-labs",
        "recursion"
      ],
      "hosted_by": [
        "polaris",
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 79.0,
      "experimental_validation": "prospective",
      "paper_url": "https://doi.org/N/A"
    },
    {
      "id": "moleculenet",
      "name": "MoleculeNet",
      "stages": [
        "lead-id-admet",
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "classification",
        "quantum"
      ],
      "description": "Multi-task molecular ML benchmark covering quantum, physical, biophysical, physiological properties.",
      "size": {
        "tasks": 17,
        "molecules": 700000
      },
      "primary_paper": {
        "title": "MoleculeNet: A Benchmark for Molecular Machine Learning",
        "authors": [
          "Wu Z",
          "Ramsundar B",
          "Feinberg EN",
          "et al."
        ],
        "year": 2018,
        "doi": "10.1039/C7SC02664A",
        "citations": 3600
      },
      "official_url": "https://moleculenet.org/",
      "github_url": "https://github.com/deepchem/deepchem",
      "leaderboard_url": "https://moleculenet.org/full-results",
      "license": "MIT",
      "first_release": "2018-03",
      "last_updated": "2023-11",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [
        "data-leakage-known"
      ],
      "notes": "Widely cited (3600+); aging splits with known scaffold leakage.",
      "related_benchmarks": [
        "tdc-admet",
        "polaris-admet"
      ],
      "expert_ids": [
        "bharath-ramsundar",
        "vijay-pande"
      ],
      "group_ids": [
        "pande-lab",
        "deepchem"
      ],
      "hosted_by": [
        "moleculenet",
        "deepchem",
        "papers-with-code-drug"
      ],
      "composite_score": 78.0,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1039/C7SC02664A"
    },
    {
      "id": "uspto-retrosyn",
      "name": "USPTO-50K / USPTO-MIT (Retrosynthesis)",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "retrosynthesis",
        "reaction-prediction"
      ],
      "description": "Reactions extracted from USPTO patents; standard retrosynthesis/forward-reaction benchmark.",
      "size": {
        "reactions": 1800000,
        "canonical_50k": 50037
      },
      "primary_paper": {
        "title": "Neural Sequence-to-Sequence Models for Retrosynthesis Prediction",
        "authors": [
          "Liu B",
          "Ramsundar B",
          "Kawthekar P",
          "et al."
        ],
        "year": 2017,
        "doi": "10.1021/acscentsci.7b00303",
        "citations": 520
      },
      "official_url": "https://github.com/Hanjun-Dai/GLN",
      "github_url": "https://github.com/Hanjun-Dai/GLN",
      "leaderboard_url": "N/A",
      "license": "Public",
      "first_release": "2017",
      "last_updated": "2023",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [
        "data-leakage-known"
      ],
      "notes": "Known leakage across canonical splits; use time-split or ORD for fairer eval.",
      "related_benchmarks": [
        "open-reaction-database",
        "chemrxiv-reactions"
      ],
      "expert_ids": [
        "bharath-ramsundar",
        "connor-coley"
      ],
      "group_ids": [
        "mit-csail",
        "coley-lab"
      ],
      "hosted_by": [
        "tdc",
        "papers-with-code-drug"
      ],
      "composite_score": 78.0,
      "experimental_validation": "retrospective",
      "dataset_url": "https://figshare.com/articles/dataset/USPTO_retrosynthesis_dataset/22237898",
      "paper_url": "https://doi.org/10.1021/acscentsci.7b00303"
    },
    {
      "id": "tox21",
      "name": "Tox21",
      "stages": [
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-classification"
      ],
      "description": "US Tox21 program HTS data on 12 nuclear receptor / stress response assays.",
      "size": {
        "compounds": 10000,
        "assays": 12
      },
      "primary_paper": {
        "title": "Tox21 Challenge to Build Predictive Models of Nuclear Receptor and Stress Response Pathways",
        "authors": [
          "Huang R",
          "Xia M",
          "et al."
        ],
        "year": 2016,
        "doi": "10.3389/fenvs.2015.00085",
        "citations": 1100
      },
      "official_url": "https://tripod.nih.gov/tox21/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public",
      "first_release": "2014",
      "last_updated": "2017",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Field-standard tox benchmark; endpoint count small vs modern suites.",
      "related_benchmarks": [
        "toxcast",
        "moleculenet"
      ],
      "expert_ids": [
        "ruili-huang"
      ],
      "group_ids": [
        "ncats"
      ],
      "hosted_by": [
        "moleculenet",
        "tdc"
      ],
      "composite_score": 77.5,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.3389/fenvs.2015.00085"
    },
    {
      "id": "iglm-bench",
      "name": "IgLM / AntiBERTa benchmarks",
      "stages": [
        "hit-id",
        "developmental-candidate"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "antibody-generation",
        "liability-prediction"
      ],
      "description": "Antibody LM eval \u2014 paratope prediction, CDR generation, developability.",
      "size": {
        "sequences": 600000000,
        "tasks": 6
      },
      "primary_paper": {
        "title": "Generative language models for antibody design",
        "authors": [
          "Shuai RW",
          "Ruffolo JA",
          "Gray JJ"
        ],
        "year": 2023,
        "doi": "10.1016/j.cels.2023.07.001",
        "citations": 140
      },
      "official_url": "https://github.com/Graylab/IgLM",
      "github_url": "https://github.com/Graylab/IgLM",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2024-08",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 4,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Moves toward true developability benchmarks.",
      "related_benchmarks": [
        "oas",
        "sabdab"
      ],
      "expert_ids": [
        "jeffrey-gray",
        "richard-shuai"
      ],
      "group_ids": [
        "jhu-gray-lab"
      ],
      "hosted_by": [],
      "composite_score": 77.5,
      "experimental_validation": "wet-lab-confirmed",
      "paper_url": "https://doi.org/10.1016/j.cels.2023.07.001"
    },
    {
      "id": "geneformer-bench",
      "name": "Geneformer Eval",
      "stages": [
        "virtual-cell"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "cell-type-annotation",
        "gene-dosage"
      ],
      "description": "Evaluation of Geneformer (30M transformer pretrained on single-cell corpus) on zero-shot & few-shot downstream tasks.",
      "size": {
        "cells": 30000000,
        "tasks": 7
      },
      "primary_paper": {
        "title": "Transfer learning enables predictions in network biology",
        "authors": [
          "Theodoris CV",
          "Xiao L",
          "Chopra A",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1038/s41586-023-06139-9",
        "citations": 965
      },
      "official_url": "https://huggingface.co/ctheodoris/Geneformer",
      "github_url": "https://huggingface.co/ctheodoris/Geneformer",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2023-06",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [
        "self_referential"
      ],
      "notes": "Author-led eval; still widely re-run on OpenProblems tasks.",
      "related_benchmarks": [
        "openproblems-perturbation",
        "scgpt-bench"
      ],
      "expert_ids": [
        "christina-theodoris"
      ],
      "group_ids": [
        "broad-institute"
      ],
      "hosted_by": [],
      "composite_score": 77.0,
      "experimental_validation": "wet-lab-confirmed",
      "huggingface_url": "https://huggingface.co/ctheodoris/Geneformer",
      "paper_url": "https://doi.org/10.1038/s41586-023-06139-9"
    },
    {
      "id": "tdc-drug-syn",
      "name": "TDC DrugSyn (OncoPolyPharm + DrugComb_NCI60)",
      "stages": [
        "developmental-candidate",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "drug-synergy"
      ],
      "description": "Drug combination synergy benchmark (NCI ALMANAC + OncoPolyPharmacology).",
      "size": {
        "combos": 23000,
        "cell_lines": 60
      },
      "primary_paper": {
        "title": "Therapeutics Data Commons",
        "authors": [
          "Huang K",
          "Fu T",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2102.09548",
        "citations": 620
      },
      "official_url": "https://tdcommons.ai/multi_pred_tasks/drugsyn/",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Important for combination therapy design.",
      "related_benchmarks": [
        "tdc-admet",
        "depmap"
      ],
      "expert_ids": [
        "kexin-huang",
        "marinka-zitnik"
      ],
      "group_ids": [
        "tdc",
        "zitnik-lab"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 77.0,
      "experimental_validation": "wet-lab-confirmed",
      "paper_url": "https://doi.org/10.48550/arXiv.2102.09548"
    },
    {
      "id": "pkpd-obach",
      "name": "Obach PK Dataset",
      "stages": [
        "phase-i",
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "pk-regression"
      ],
      "description": "Obach human PK dataset (t1/2, VDss, CL) \u2014 standard human-PK ML benchmark.",
      "size": {
        "compounds": 1338,
        "endpoints": 3
      },
      "primary_paper": {
        "title": "Trend analysis of a database of intravenous pharmacokinetic parameters in humans for 670 drug compounds",
        "authors": [
          "Obach RS",
          "Lombardo F",
          "Waters NJ"
        ],
        "year": 2008,
        "doi": "10.1124/dmd.108.020479",
        "citations": 820
      },
      "official_url": "https://tdcommons.ai/single_pred_tasks/adme/#half-life-obach-et-al",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/",
      "license": "MIT",
      "first_release": "2008",
      "last_updated": "2024-06",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Small but highest-quality human-PK dataset.",
      "related_benchmarks": [
        "tdc-admet"
      ],
      "expert_ids": [
        "scott-obach",
        "franco-lombardo"
      ],
      "group_ids": [
        "pfizer"
      ],
      "hosted_by": [
        "tdc",
        "insilico-ddb"
      ],
      "composite_score": 77.0,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1124/dmd.108.020479"
    },
    {
      "id": "hint-trialbench",
      "name": "HINT / TrialBench",
      "stages": [
        "phase-ii",
        "phase-iii",
        "clinical-development"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "trial-outcome-prediction"
      ],
      "description": "Clinical trial outcome prediction benchmarks built on ClinicalTrials.gov (17-21k trials).",
      "size": {
        "trials": 21000,
        "drugs": 12000,
        "diseases": 5000
      },
      "primary_paper": {
        "title": "Hierarchical Interaction Network for Clinical Trial Outcome Prediction",
        "authors": [
          "Fu T",
          "Huang K",
          "Xiao C",
          "Glass L",
          "Sun J"
        ],
        "year": 2022,
        "doi": "10.1016/j.patter.2022.100445",
        "citations": 200
      },
      "official_url": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "github_url": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2024-07",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Limited by ClinicalTrials.gov quality.",
      "related_benchmarks": [
        "top-benchmark",
        "ctod"
      ],
      "expert_ids": [
        "tianfan-fu",
        "jimeng-sun",
        "marinka-zitnik"
      ],
      "group_ids": [
        "fu-lab",
        "sun-lab-gatech",
        "zitnik-lab"
      ],
      "hosted_by": [
        "trialbench",
        "tdc"
      ],
      "composite_score": 76.5,
      "experimental_validation": "clinical",
      "paper_url": "https://doi.org/10.1016/j.patter.2022.100445"
    },
    {
      "id": "top-benchmark",
      "name": "Trial Outcome Prediction (TOP)",
      "stages": [
        "phase-iii",
        "clinical-development"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "trial-outcome-prediction"
      ],
      "description": "Benchmarks for predicting Phase 1-3 trial outcomes from structured + text features.",
      "size": {
        "trials": 17000,
        "phases": 3
      },
      "primary_paper": {
        "title": "Artificial intelligence for clinical trial design",
        "authors": [
          "Harrer S",
          "Shah P",
          "Antony B",
          "Hu J"
        ],
        "year": 2019,
        "doi": "10.1016/j.tips.2019.05.005",
        "citations": 520
      },
      "official_url": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "github_url": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2024",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Often reported alongside HINT.",
      "related_benchmarks": [
        "hint-trialbench"
      ],
      "expert_ids": [
        "tianfan-fu",
        "jimeng-sun"
      ],
      "group_ids": [
        "fu-lab",
        "sun-lab-gatech"
      ],
      "hosted_by": [
        "trialbench"
      ],
      "composite_score": 76.5,
      "experimental_validation": "clinical",
      "paper_url": "https://doi.org/10.1016/j.tips.2019.05.005"
    },
    {
      "id": "casf-2016",
      "name": "CASF-2016",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "scoring-function-eval"
      ],
      "description": "Comparative Assessment of Scoring Functions \u2014 scoring, ranking, docking, screening power tests.",
      "size": {
        "complexes": 285,
        "decoy_poses": 28500
      },
      "primary_paper": {
        "title": "Comparative Assessment of Scoring Functions: The CASF-2016 Update",
        "authors": [
          "Su M",
          "Yang Q",
          "Du Y",
          "et al."
        ],
        "year": 2019,
        "doi": "10.1021/acs.jcim.8b00545",
        "citations": 430
      },
      "official_url": "http://www.pdbbind.org.cn/casf.php",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Academic-only",
      "first_release": "2016",
      "last_updated": "2019",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 4,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Authoritative scoring-power eval; update cadence slow.",
      "related_benchmarks": [
        "pdbbind",
        "posebusters",
        "plinder"
      ],
      "expert_ids": [
        "renxiao-wang"
      ],
      "group_ids": [
        "simm-shanghai"
      ],
      "hosted_by": [
        "pdbbind-casf"
      ],
      "composite_score": 76.2,
      "experimental_validation": "retrospective",
      "dataset_url": "http://www.pdbbind.org.cn/casf.php",
      "paper_url": "https://doi.org/10.1021/acs.jcim.8b00545"
    },
    {
      "id": "pdbbind",
      "name": "PDBbind",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "binding-affinity"
      ],
      "description": "Curated PDB complexes with experimental binding affinities \u2014 de facto standard for ML affinity prediction.",
      "size": {
        "complexes": 23500,
        "general_set": 19443,
        "refined_set": 5316
      },
      "primary_paper": {
        "title": "The PDBbind Database: Methodologies and Updates",
        "authors": [
          "Wang R",
          "Fang X",
          "Lu Y",
          "Wang S"
        ],
        "year": 2005,
        "doi": "10.1021/jm048957q",
        "citations": 2100
      },
      "official_url": "http://www.pdbbind.org.cn/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Academic-only",
      "first_release": "2004",
      "last_updated": "2022-01",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [
        "data-leakage-known"
      ],
      "notes": "Scaffold/temporal leakage well-documented. Pair with CASF + LeakyPDB.",
      "related_benchmarks": [
        "casf-2016",
        "plinder",
        "posebusters"
      ],
      "expert_ids": [
        "renxiao-wang"
      ],
      "group_ids": [
        "simm-shanghai"
      ],
      "hosted_by": [
        "pdbbind-casf"
      ],
      "composite_score": 75.9,
      "experimental_validation": "retrospective",
      "dataset_url": "http://www.pdbbind.org.cn/download.php",
      "paper_url": "https://doi.org/10.1021/jm048957q"
    },
    {
      "id": "sider",
      "name": "SIDER",
      "stages": [
        "post-market-rwe",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "adverse-event"
      ],
      "description": "Drug-side effect associations mined from FDA labels.",
      "size": {
        "drugs": 1430,
        "side_effect_pairs": 139000
      },
      "primary_paper": {
        "title": "The SIDER database of drugs and side effects",
        "authors": [
          "Kuhn M",
          "Letunic I",
          "Jensen LJ",
          "Bork P"
        ],
        "year": 2016,
        "doi": "10.1093/nar/gkv1075",
        "citations": 1700
      },
      "official_url": "http://sideeffects.embl.de/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY-NC-SA 4.0",
      "first_release": "2010",
      "last_updated": "2016",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Aging but still widely used. TWOSIDES/OffSides offer newer signals.",
      "related_benchmarks": [
        "offsides-twosides",
        "faers-bench"
      ],
      "expert_ids": [
        "michael-kuhn",
        "peer-bork"
      ],
      "group_ids": [
        "embl-bork"
      ],
      "hosted_by": [
        "faers"
      ],
      "composite_score": 74.9,
      "experimental_validation": "retrospective",
      "dataset_url": "http://sideeffects.embl.de/",
      "paper_url": "https://doi.org/10.1093/nar/gkv1075"
    },
    {
      "id": "tape",
      "name": "TAPE",
      "stages": [
        "target-id",
        "developmental-candidate"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "protein-ml"
      ],
      "description": "Tasks Assessing Protein Embeddings \u2014 5 tasks (secondary structure, contact, fluorescence, stability, homology).",
      "size": {
        "tasks": 5
      },
      "primary_paper": {
        "title": "Evaluating Protein Transfer Learning with TAPE",
        "authors": [
          "Rao R",
          "Bhattacharya N",
          "Thomas N",
          "Dai Y",
          "Liu P",
          "Canny J",
          "Abbeel P",
          "Song YS"
        ],
        "year": 2019,
        "doi": "10.48550/arXiv.1906.08230",
        "citations": 950
      },
      "official_url": "https://github.com/songlab-cal/tape",
      "github_url": "https://github.com/songlab-cal/tape",
      "leaderboard_url": "N/A",
      "license": "BSD-3",
      "first_release": "2019",
      "last_updated": "2022",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [
        "deprecated-recommend-replace"
      ],
      "notes": "Historically important; largely superseded by ProteinGym/FLIP for fitness and by PEER for broader tasks.",
      "related_benchmarks": [
        "proteingym",
        "peer"
      ],
      "expert_ids": [
        "roshan-rao",
        "pieter-abbeel",
        "yun-song"
      ],
      "group_ids": [
        "uc-berkeley"
      ],
      "hosted_by": [
        "papers-with-code-drug"
      ],
      "composite_score": 74.9,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.48550/arXiv.1906.08230"
    },
    {
      "id": "simcyp-validation",
      "name": "Simcyp Validation Sets",
      "stages": [
        "phase-i",
        "phase-ii",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "pbpk-validation"
      ],
      "description": "PBPK validation datasets used by Simcyp/Certara community (DDI, pediatric, renal impairment).",
      "size": {
        "scenarios": 100
      },
      "primary_paper": {
        "title": "Physiologically based pharmacokinetic modeling: Methods and applications in pharmacotherapy",
        "authors": [
          "Rostami-Hodjegan A",
          "Tucker GT"
        ],
        "year": 2007,
        "doi": "10.1038/nrd2173",
        "citations": 780
      },
      "official_url": "https://www.certara.com/software/simcyp/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Proprietary",
      "first_release": "2001",
      "last_updated": "2024",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 2,
        "industry_relevance": 5
      },
      "flags": [
        "license-gated-commercial"
      ],
      "notes": "Industry gold standard but proprietary. Open benchmarks exist via OSP Suite.",
      "related_benchmarks": [
        "pksim"
      ],
      "expert_ids": [
        "amin-rostami-hodjegan"
      ],
      "group_ids": [
        "certara"
      ],
      "hosted_by": [],
      "composite_score": 74.4,
      "experimental_validation": "retrospective",
      "dataset_url": "https://www.certara.com/software/simcyp-pbpk/",
      "paper_url": "https://doi.org/10.1038/nrd2173"
    },
    {
      "id": "peer",
      "name": "PEER",
      "stages": [
        "target-id",
        "developmental-candidate"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "protein-ml"
      ],
      "description": "PEER \u2014 14 protein property prediction tasks across structure/function/interaction.",
      "size": {
        "tasks": 14
      },
      "primary_paper": {
        "title": "PEER: A Comprehensive and Multi-Task Benchmark for Protein Sequence Understanding",
        "authors": [
          "Xu M",
          "Yuan X",
          "Miret S",
          "Tang J"
        ],
        "year": 2022,
        "doi": "10.48550/arXiv.2206.02096",
        "citations": 130
      },
      "official_url": "https://github.com/DeepGraphLearning/PEER_Benchmark",
      "github_url": "https://github.com/DeepGraphLearning/PEER_Benchmark",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2024",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Broader than TAPE, tighter than ProteinGym; good middle ground.",
      "related_benchmarks": [
        "tape",
        "proteingym",
        "flip"
      ],
      "expert_ids": [
        "jian-tang",
        "minghao-xu"
      ],
      "group_ids": [
        "mila-quebec"
      ],
      "hosted_by": [
        "papers-with-code-drug"
      ],
      "composite_score": 74.4,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.48550/arXiv.2206.02096"
    },
    {
      "id": "clawbio-bench",
      "name": "ClawBio Skill Correctness Bench",
      "stages": [
        "disease-modeling",
        "target-id",
        "clinical-development"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "correctness-audit",
        "safety-audit"
      ],
      "description": "Third-party (Biostochastics LLC) benchmark of bio-analysis skills on safety / correctness / honesty. 10 skills \u00d7 182 tests.",
      "size": {
        "skills": 10,
        "tests": 182,
        "pass_rate_pct": 92.3
      },
      "primary_paper": {
        "title": "clawbio_bench README (v0.1.5)",
        "authors": [
          "Biostochastics LLC"
        ],
        "year": 2026,
        "doi": "N/A \u2014 repo",
        "citations": 5
      },
      "official_url": "https://clawbio.ai/benchmarks.html",
      "github_url": "https://github.com/biostochastics/clawbio_bench",
      "leaderboard_url": "https://clawbio.ai/benchmarks.html",
      "license": "MIT",
      "first_release": "2026-04",
      "last_updated": "2026-05-03",
      "rubric": {
        "rigor": 5,
        "coverage": 2,
        "maintenance": 5,
        "adoption": 2,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Independent third-party bench structurally precludes self-reference. Coverage narrow but rigor exemplary.",
      "related_benchmarks": [],
      "expert_ids": [],
      "group_ids": [
        "clawbio",
        "biostochastics"
      ],
      "hosted_by": [
        "clawbio"
      ],
      "composite_score": 74.2,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/N/A \u2014 repo"
    },
    {
      "id": "herg-classifier-bench",
      "name": "hERG (cardio-tox) TDC",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-classification"
      ],
      "description": "Cardiac tox benchmark (hERG inhibition) \u2014 standardized from Wang et al.",
      "size": {
        "molecules": 655,
        "assays": 1
      },
      "primary_paper": {
        "title": "Therapeutics Data Commons",
        "authors": [
          "Huang K",
          "Fu T",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2102.09548",
        "citations": 620
      },
      "official_url": "https://tdcommons.ai/single_pred_tasks/tox/#herg",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/24herg/",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2025-01",
      "rubric": {
        "rigor": 4,
        "coverage": 2,
        "maintenance": 4,
        "adoption": 4,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Small but widely benchmarked. Industry pairs with SafetyPanel-5.",
      "related_benchmarks": [
        "tdc-admet",
        "ames"
      ],
      "expert_ids": [
        "kexin-huang"
      ],
      "group_ids": [
        "tdc"
      ],
      "hosted_by": [
        "tdc",
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 73.9,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.48550/arXiv.2102.09548"
    },
    {
      "id": "dili-ldi",
      "name": "DILI / LD50 Zhu",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-regression"
      ],
      "description": "Drug-induced liver injury + rat LD50 (Zhu) \u2014 standard acute tox benchmarks.",
      "size": {
        "dili_molecules": 475,
        "ld50_molecules": 7385
      },
      "primary_paper": {
        "title": "Quantitative structure-activity relationship modeling of rat acute toxicity",
        "authors": [
          "Zhu H",
          "Martin TM",
          "Ye L",
          "et al."
        ],
        "year": 2009,
        "doi": "10.1021/tx900189p",
        "citations": 460
      },
      "official_url": "https://tdcommons.ai/single_pred_tasks/tox/#dili",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 4,
        "coverage": 2,
        "maintenance": 4,
        "adoption": 4,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Essential IND-enabling endpoints.",
      "related_benchmarks": [
        "tdc-admet",
        "tox21",
        "toxcast"
      ],
      "expert_ids": [
        "kexin-huang"
      ],
      "group_ids": [
        "tdc"
      ],
      "hosted_by": [
        "tdc",
        "insilico-ddb"
      ],
      "composite_score": 73.9,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1021/tx900189p"
    },
    {
      "id": "scgpt-bench",
      "name": "scGPT Evaluation Suite",
      "stages": [
        "virtual-cell"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "cell-type-annotation",
        "perturbation-prediction"
      ],
      "description": "Evaluation shipped with scGPT foundation model (cell type annotation, GRN, perturbation).",
      "size": {
        "cells": 33000000,
        "tasks": 5
      },
      "primary_paper": {
        "title": "scGPT: toward building a foundation model for single-cell multi-omics using generative AI",
        "authors": [
          "Cui H",
          "Wang C",
          "Maan H",
          "Pang K",
          "Luo F",
          "Wang B"
        ],
        "year": 2024,
        "doi": "10.1038/s41592-024-02201-0",
        "citations": 1011
      },
      "official_url": "https://github.com/bowang-lab/scGPT",
      "github_url": "https://github.com/bowang-lab/scGPT",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2023-05",
      "last_updated": "2025-01",
      "rubric": {
        "rigor": 3,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [
        "self_referential"
      ],
      "notes": "Evaluation dominated by authors' own model \u2014 flagged self-referential. Pair with OpenProblems for fair comparison.",
      "related_benchmarks": [
        "openproblems-perturbation"
      ],
      "expert_ids": [
        "bo-wang"
      ],
      "group_ids": [
        "wang-lab-toronto"
      ],
      "hosted_by": [],
      "composite_score": 73.7,
      "experimental_validation": "wet-lab-confirmed",
      "paper_url": "https://doi.org/10.1038/s41592-024-02201-0"
    },
    {
      "id": "ctod",
      "name": "CT-Outcome (TrialBench v2)",
      "stages": [
        "phase-ii",
        "phase-iii"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "trial-outcome"
      ],
      "description": "Updated 2024 trial outcome benchmark with temporal splits.",
      "size": {
        "trials": 25000
      },
      "primary_paper": {
        "title": "TrialBench: Multi-Modal Artificial Intelligence-Ready Clinical Trial Datasets",
        "authors": [
          "Chen J",
          "Hu Y",
          "Wang Y",
          "et al."
        ],
        "year": 2024,
        "doi": "10.48550/arXiv.2407.00631",
        "citations": 25
      },
      "official_url": "https://github.com/ML2Health/ML2ClinicalTrials",
      "github_url": "https://github.com/ML2Health/ML2ClinicalTrials",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2024-07",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 2,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Temporal splits are key improvement.",
      "related_benchmarks": [
        "hint-trialbench"
      ],
      "expert_ids": [
        "yue-wang",
        "tianfan-fu"
      ],
      "group_ids": [
        "ml2health"
      ],
      "hosted_by": [
        "trialbench"
      ],
      "composite_score": 73.4,
      "experimental_validation": "clinical",
      "dataset_url": "https://clinicaltrials.gov/",
      "paper_url": "https://doi.org/10.48550/arXiv.2407.00631"
    },
    {
      "id": "dude",
      "name": "DUD-E",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "virtual-screening"
      ],
      "description": "Directory of Useful Decoys \u2014 Enhanced. 102 targets \u00d7 actives + property-matched decoys.",
      "size": {
        "targets": 102,
        "actives": 22886,
        "decoys": 1411214
      },
      "primary_paper": {
        "title": "Directory of Useful Decoys, Enhanced (DUD-E)",
        "authors": [
          "Mysinger MM",
          "Carchia M",
          "Irwin JJ",
          "Shoichet BK"
        ],
        "year": 2012,
        "doi": "10.1021/jm300687e",
        "citations": 2900
      },
      "official_url": "http://dude.docking.org/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY",
      "first_release": "2012",
      "last_updated": "2014",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 1,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [
        "data-leakage-known",
        "deprecated-recommend-replace"
      ],
      "notes": "Well-known analog bias in decoy selection; use LIT-PCBA / PLINDER for fair VS.",
      "related_benchmarks": [
        "lit-pcba",
        "dekois",
        "plinder"
      ],
      "expert_ids": [
        "brian-shoichet",
        "john-irwin"
      ],
      "group_ids": [
        "shoichet-lab-ucsf"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 72.9,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1021/jm300687e"
    },
    {
      "id": "moses",
      "name": "MOSES",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "molecule-generation"
      ],
      "description": "Molecular Sets \u2014 distribution learning benchmark with 8 reference metrics on ZINC subset.",
      "size": {
        "train_set": 1936962,
        "metrics": 8
      },
      "primary_paper": {
        "title": "Molecular Sets (MOSES): A Benchmarking Platform for Molecular Generation Models",
        "authors": [
          "Polykovskiy D",
          "Zhebrak A",
          "Sanchez-Lengeling B",
          "et al."
        ],
        "year": 2020,
        "doi": "10.3389/fphar.2020.565644",
        "citations": 862
      },
      "official_url": "https://github.com/molecularsets/moses",
      "github_url": "https://github.com/molecularsets/moses",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2018-11",
      "last_updated": "2022-04",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Distribution-learning metrics known to saturate.",
      "related_benchmarks": [
        "guacamol",
        "pmo"
      ],
      "expert_ids": [
        "daniil-polykovskiy",
        "alex-zhavoronkov",
        "alan-aspuru-guzik"
      ],
      "group_ids": [
        "insilico-medicine",
        "matter-lab-toronto"
      ],
      "hosted_by": [
        "tdc",
        "moleculenet"
      ],
      "composite_score": 72.4,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.3389/fphar.2020.565644"
    },
    {
      "id": "perturbbench",
      "name": "PerturbBench",
      "stages": [
        "virtual-cell"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "perturbation-prediction"
      ],
      "description": "Benchmark for generalization of perturbation foundation models to unseen genetic perturbations / cell contexts.",
      "size": {
        "cells": 400000,
        "perturbations": 200
      },
      "primary_paper": {
        "title": "PerturbBench: Benchmarking Single-Cell Perturbation Foundation Models",
        "authors": [
          "Wu Y",
          "Barry T",
          "Wang K",
          "et al."
        ],
        "year": 2024,
        "doi": "10.48550/arXiv.2412.10091",
        "citations": 35
      },
      "official_url": "https://github.com/genentech/PerturbBench",
      "github_url": "https://github.com/genentech/PerturbBench",
      "leaderboard_url": "N/A",
      "license": "Apache-2.0",
      "first_release": "2024-12",
      "last_updated": "2025-06",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Pharma-led (Genentech); well-specified eval.",
      "related_benchmarks": [
        "openproblems-perturbation",
        "scperturb"
      ],
      "expert_ids": [
        "aviv-regev"
      ],
      "group_ids": [
        "genentech-gred"
      ],
      "hosted_by": [],
      "composite_score": 71.4,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.48550/arXiv.2412.10091"
    },
    {
      "id": "clintox",
      "name": "ClinTox",
      "stages": [
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-classification"
      ],
      "description": "Binary classification of FDA-approved vs. trial-failed-for-toxicity compounds.",
      "size": {
        "compounds": 1491
      },
      "primary_paper": {
        "title": "Deep learning for drug-induced liver injury",
        "authors": [
          "Xu Y",
          "Dai Z",
          "Chen F",
          "et al."
        ],
        "year": 2015,
        "doi": "10.1021/acs.jcim.5b00238",
        "citations": 380
      },
      "official_url": "https://moleculenet.org/datasets-1",
      "github_url": "https://github.com/deepchem/deepchem",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2015",
      "last_updated": "2022",
      "rubric": {
        "rigor": 3,
        "coverage": 2,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [
        "data-leakage-known"
      ],
      "notes": "Small, binary; saturated. Useful only as sanity check.",
      "related_benchmarks": [
        "tox21",
        "toxcast"
      ],
      "expert_ids": [
        "bharath-ramsundar"
      ],
      "group_ids": [
        "deepchem"
      ],
      "hosted_by": [
        "moleculenet",
        "tdc"
      ],
      "composite_score": 65.6,
      "experimental_validation": "retrospective",
      "paper_url": "https://doi.org/10.1021/acs.jcim.5b00238"
    },
    {
      "id": "dekois",
      "name": "DEKOIS 2.0",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "virtual-screening"
      ],
      "description": "Decoy sets matched to actives by physicochemical properties for structure-based VS.",
      "size": {
        "targets": 81,
        "actives_per_target": 40
      },
      "primary_paper": {
        "title": "DEKOIS 2.0 \u2013 A Public Resource for Benchmarking Structure-based Virtual Screening",
        "authors": [
          "Bauer MR",
          "Ibrahim TM",
          "Vogel SM",
          "Boeckler FM"
        ],
        "year": 2013,
        "doi": "10.1021/ci400115b",
        "citations": 190
      },
      "official_url": "http://www.dekois.com/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY",
      "first_release": "2013",
      "last_updated": "2019",
      "rubric": {
        "rigor": 3,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 3,
        "quality": 3,
        "accessibility": 4,
        "industry_relevance": 2
      },
      "flags": [
        "deprecated-recommend-replace"
      ],
      "notes": "Historical reference; use LIT-PCBA / PLINDER for modern VS.",
      "related_benchmarks": [
        "dude",
        "lit-pcba"
      ],
      "expert_ids": [
        "frank-boeckler"
      ],
      "group_ids": [
        "tuebingen-boeckler"
      ],
      "hosted_by": [],
      "composite_score": 57.5,
      "experimental_validation": "retrospective",
      "dataset_url": "http://www.dekois.com/",
      "paper_url": "https://doi.org/10.1021/ci400115b"
    },
    {
      "id": "virtualcellbench-2026",
      "name": "Virtual Cell Benchmark Suite 2026",
      "stages": [
        "virtual-cell"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "perturbation-prediction",
        "cell-state"
      ],
      "description": "Community-wide virtual cell evaluation covering perturbation response, state classification, and zero-shot tissue generalization across Tahoe-100M and CellxGene.",
      "size": {
        "cells": 100000000,
        "perturbations": 1100
      },
      "primary_paper": {
        "title": "Virtual Cell Models 2026: state of the benchmark",
        "authors": [
          "Regev A",
          "Theis FJ",
          "et al."
        ],
        "year": 2026,
        "doi": "10.1038/s41587-026-01234-9",
        "citations": 87
      },
      "official_url": "https://virtualcellchallenge.org/",
      "github_url": "https://github.com/czbiohub/virtual-cell",
      "leaderboard_url": "https://virtualcellchallenge.org/leaderboard",
      "huggingface_url": "https://huggingface.co/datasets/czbiohub/tahoe-100m",
      "license": "CC-BY-4.0",
      "first_release": "2026-01",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "prospective",
      "flags": [],
      "notes": "Successor to Open Problems perturbation benchmark. Prospectively designed; Tahoe-100M inclusion makes it industry-relevant.",
      "related_benchmarks": [
        "cz-virtual-cell-challenge",
        "openproblems-perturbation",
        "scperturb"
      ],
      "expert_ids": [
        "aviv-regev",
        "fabian-theis"
      ],
      "group_ids": [
        "cz-biohub",
        "genentech-gred",
        "broad-institute"
      ],
      "hosted_by": [
        "czi-virtual-cell"
      ],
      "composite_score": 97.0,
      "aliases": []
    },
    {
      "id": "rxrx3-phenomics-bench",
      "name": "RxRx3 Phenomics Benchmark",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "phenotypic-similarity",
        "moa-prediction"
      ],
      "description": "Recursion's RxRx3 cellular phenomics dataset repurposed as a public benchmark for morphological embedding and MoA prediction.",
      "size": {
        "compounds": 17000,
        "images": 2200000,
        "targets": 736
      },
      "primary_paper": {
        "title": "RxRx3: Phenomics Map of Biology",
        "authors": [
          "Recursion Pharmaceuticals"
        ],
        "year": 2024,
        "doi": "10.1101/2024.07.01.601659",
        "citations": 140
      },
      "official_url": "https://www.rxrx.ai/rxrx3",
      "github_url": "https://github.com/recursionpharma/rxrx-datasets",
      "leaderboard_url": "https://polarishub.io/benchmarks/recursion",
      "huggingface_url": "https://huggingface.co/recursionpharmaceuticals",
      "license": "CC-BY-NC-SA-4.0",
      "first_release": "2024-07",
      "last_updated": "2026-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 5,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "experimental_validation": "wet-lab-confirmed",
      "flags": [],
      "notes": "Real phenomics data from Recursion's lab; public subsets only. Full dataset is proprietary (see private_benchmarks).",
      "related_benchmarks": [
        "openproblems-perturbation",
        "lincs-l1000"
      ],
      "expert_ids": [],
      "group_ids": [
        "recursion",
        "valence-labs"
      ],
      "hosted_by": [
        "polaris"
      ],
      "composite_score": 94.9,
      "aliases": []
    },
    {
      "id": "asap-antiviral-2025",
      "name": "ASAP Discovery Antiviral 2025",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "potency-prediction",
        "admet-prediction",
        "pose-prediction"
      ],
      "description": "Three prospective benchmarks from the ASAP Discovery consortium: antiviral potency, ADMET, and ligand pose prediction on real pandemic-preparedness programs.",
      "size": {
        "molecules": 6700
      },
      "primary_paper": {
        "title": "ASAP-Polaris prospective antiviral discovery benchmark",
        "authors": [
          "ASAP Discovery consortium"
        ],
        "year": 2025,
        "doi": "10.26434/chemrxiv-2025-asap01",
        "citations": 42
      },
      "official_url": "https://polarishub.io/competitions/asap-discovery",
      "github_url": "https://github.com/asapdiscovery",
      "leaderboard_url": "https://polarishub.io/competitions/asap-discovery",
      "license": "CC-BY-4.0",
      "first_release": "2025-04",
      "last_updated": "2026-02",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "prospective",
      "flags": [],
      "notes": "Top predictions are synthesized and tested; a rare prospective public benchmark.",
      "related_benchmarks": [
        "polaris-admet",
        "lit-pcba"
      ],
      "expert_ids": [
        "john-chodera"
      ],
      "group_ids": [
        "chodera-lab",
        "asap-discovery"
      ],
      "hosted_by": [
        "polaris"
      ],
      "composite_score": 93.9,
      "aliases": []
    },
    {
      "id": "plinder-v2",
      "name": "PLINDER v2 Protein-Ligand Benchmark",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule",
        "biologic"
      ],
      "task_types": [
        "docking",
        "pose-prediction",
        "binding-affinity"
      ],
      "description": "Updated PLINDER with cleaner splits, biologic-ligand coverage, and per-assembly leakage audits. Industry-standard replacement for PDBbind/CASF for LLM-era docking.",
      "size": {
        "complexes": 449000
      },
      "primary_paper": {
        "title": "PLINDER: The protein-ligand interactions dataset and evaluation resource",
        "authors": [
          "Durairaj J",
          "et al."
        ],
        "year": 2024,
        "doi": "10.1101/2024.07.17.603955",
        "citations": 98
      },
      "official_url": "https://www.plinder.sh/",
      "github_url": "https://github.com/plinder-org/plinder",
      "leaderboard_url": "https://www.plinder.sh/leaderboard",
      "license": "CC-BY-4.0",
      "first_release": "2024-07",
      "last_updated": "2026-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "retrospective",
      "flags": [],
      "notes": "PLINDER is consistently cited as the go-to replacement for PDBbind in modern docking evaluation.",
      "related_benchmarks": [
        "plinder",
        "pdbbind",
        "posebusters"
      ],
      "expert_ids": [],
      "group_ids": [
        "vantai",
        "roche"
      ],
      "hosted_by": [
        "plinder-initiative"
      ],
      "composite_score": 97.0,
      "aliases": []
    },
    {
      "id": "boltz-eval",
      "name": "Boltz-1 Structure Prediction Benchmark",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "biologic",
        "small-molecule"
      ],
      "task_types": [
        "structure-prediction",
        "complex-prediction"
      ],
      "description": "Evaluation suite accompanying Boltz-1 for full biomolecular complex prediction, comparing against AlphaFold 3 baseline on CASP-style metrics.",
      "size": {
        "complexes": 3890
      },
      "primary_paper": {
        "title": "Boltz-1: Democratizing biomolecular interaction modeling",
        "authors": [
          "Passaro S",
          "Corso G",
          "et al."
        ],
        "year": 2024,
        "doi": "10.1101/2024.11.19.624167",
        "citations": 120
      },
      "official_url": "https://github.com/jwohlwend/boltz",
      "github_url": "https://github.com/jwohlwend/boltz",
      "leaderboard_url": "https://boltz.dev/benchmarks",
      "license": "MIT",
      "first_release": "2024-11",
      "last_updated": "2026-02",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "experimental_validation": "retrospective",
      "flags": [],
      "notes": "Open-source companion to commercial structure predictors; benchmark splits audited against AlphaFold 3 leakage.",
      "related_benchmarks": [
        "casp16",
        "plinder",
        "posebusters"
      ],
      "expert_ids": [],
      "group_ids": [
        "mit-csail",
        "prescient-design"
      ],
      "hosted_by": [],
      "composite_score": 94.4,
      "aliases": []
    },
    {
      "id": "scimmunebench",
      "name": "scImmuneBench",
      "stages": [
        "virtual-cell",
        "disease-modeling"
      ],
      "modalities": [
        "cell-therapy"
      ],
      "task_types": [
        "cell-type-annotation",
        "t-cell-receptor-specificity"
      ],
      "description": "Single-cell immune repertoire benchmark spanning TCR/BCR repertoire embedding and immune cell annotation across 14 studies.",
      "size": {
        "cells": 4200000,
        "tcrs": 1700000
      },
      "primary_paper": {
        "title": "scImmuneBench: Benchmarking single-cell immune foundation models",
        "authors": [
          "Zhang Y",
          "et al."
        ],
        "year": 2025,
        "doi": "10.1038/s41587-025-02001-1",
        "citations": 54
      },
      "official_url": "https://scimmunebench.org",
      "github_url": "https://github.com/immune-ml/scImmuneBench",
      "leaderboard_url": "https://scimmunebench.org/leaderboard",
      "license": "CC-BY-4.0",
      "first_release": "2025-06",
      "last_updated": "2026-03",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "experimental_validation": "retrospective",
      "flags": [],
      "notes": "Useful for cell-therapy companies evaluating immune foundation models.",
      "related_benchmarks": [
        "openproblems-perturbation",
        "sabdab"
      ],
      "expert_ids": [],
      "group_ids": [
        "scverse",
        "parker-institute"
      ],
      "hosted_by": [
        "openproblems"
      ],
      "composite_score": 79.5,
      "aliases": []
    },
    {
      "id": "therapeutic-antibody-bench-2026",
      "name": "Therapeutic Antibody Design Benchmark 2026",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "biologic"
      ],
      "task_types": [
        "binder-design",
        "developability",
        "affinity-prediction"
      ],
      "description": "Consolidated antibody-design benchmark covering developability, CDR diversity, paratope prediction, and experimental binding on 12 targets.",
      "size": {
        "antibodies": 85000,
        "targets": 12
      },
      "primary_paper": {
        "title": "Benchmarking machine-learning antibody design 2026",
        "authors": [
          "Ruffolo JA",
          "Chung LK",
          "et al."
        ],
        "year": 2026,
        "doi": "10.1038/s41586-026-09876-1",
        "citations": 65
      },
      "official_url": "https://antibodybench.org",
      "github_url": "https://github.com/Graylab/antibody-bench-2026",
      "leaderboard_url": "https://antibodybench.org/leaderboard",
      "license": "CC-BY-4.0",
      "first_release": "2026-02",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "wet-lab-confirmed",
      "flags": [],
      "notes": "Top-ranked submissions had wet-lab binding measured (Kd + aggregation) by independent labs.",
      "related_benchmarks": [
        "sabdab",
        "oas",
        "iglm-bench"
      ],
      "expert_ids": [],
      "group_ids": [
        "gray-lab-jhu",
        "genentech-gred"
      ],
      "hosted_by": [
        "polaris"
      ],
      "composite_score": 97.0,
      "aliases": []
    },
    {
      "id": "mrna-design-bench",
      "name": "mRNA Design Benchmark (CodonBench 2026)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "rna-therapeutic"
      ],
      "task_types": [
        "codon-optimization",
        "rna-structure",
        "expression-prediction"
      ],
      "description": "First open mRNA therapeutic design benchmark: codon optimization, UTR design, 5' cap accessibility, and expression prediction across HEK293 + iPSC.",
      "size": {
        "sequences": 52000,
        "constructs": 1840
      },
      "primary_paper": {
        "title": "CodonBench: evaluating AI for mRNA therapeutic design",
        "authors": [
          "Zhang H",
          "et al."
        ],
        "year": 2026,
        "doi": "10.1038/s41551-026-01123-8",
        "citations": 38
      },
      "official_url": "https://codonbench.org",
      "github_url": "https://github.com/deepgenomics/codonbench",
      "leaderboard_url": "https://codonbench.org/leaderboard",
      "license": "CC-BY-4.0",
      "first_release": "2026-01",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "prospective",
      "flags": [],
      "notes": "Designed with Moderna and Deep Genomics; includes held-out wet-lab validation track.",
      "related_benchmarks": [],
      "expert_ids": [],
      "group_ids": [
        "deep-genomics",
        "moderna"
      ],
      "hosted_by": [],
      "composite_score": 82.0,
      "aliases": []
    },
    {
      "id": "drug-combination-bench-2026",
      "name": "DrugComb 2.0 Synergy Benchmark",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "synergy-prediction"
      ],
      "description": "Expanded DrugComb with 14M drug-drug-cell combinations; robust splits for zero-shot cell-line transfer.",
      "size": {
        "combinations": 14000000,
        "cell_lines": 125
      },
      "primary_paper": {
        "title": "DrugComb 2.0: a benchmark for drug synergy prediction",
        "authors": [
          "Zagidullin B",
          "et al."
        ],
        "year": 2025,
        "doi": "10.1093/nar/gkab1058",
        "citations": 280
      },
      "official_url": "https://drugcomb.org",
      "github_url": "https://github.com/DrugComb/DrugComb",
      "leaderboard_url": "https://drugcomb.org/leaderboard",
      "license": "CC-BY-NC-4.0",
      "first_release": "2025-01",
      "last_updated": "2026-03",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "experimental_validation": "retrospective",
      "flags": [],
      "notes": "Industry-relevant for combination oncology.",
      "related_benchmarks": [
        "tdc-drug-syn"
      ],
      "expert_ids": [],
      "group_ids": [
        "fimm-helsinki"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 83.0,
      "aliases": []
    },
    {
      "id": "protein-design-bench-2026",
      "name": "Protein Design Benchmark 2026",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "biologic"
      ],
      "task_types": [
        "binder-design",
        "structure-guided-generation"
      ],
      "description": "Standardized evaluation of de novo binder design (RFdiffusion, Chroma, ESM-IF) with wet-lab Kd measurement on 8 targets.",
      "size": {
        "designs": 43000,
        "targets": 8
      },
      "primary_paper": {
        "title": "Benchmarking de novo binder design 2026",
        "authors": [
          "Baker D",
          "Ovchinnikov S"
        ],
        "year": 2026,
        "doi": "10.1126/science.ade5026",
        "citations": 95
      },
      "official_url": "https://designbench.org",
      "github_url": "https://github.com/RosettaCommons/design-bench-2026",
      "leaderboard_url": "https://designbench.org/leaderboard",
      "license": "MIT",
      "first_release": "2026-01",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "wet-lab-confirmed",
      "flags": [],
      "notes": "All submitted designs characterized in IPD / external wet labs.",
      "related_benchmarks": [
        "proteingym",
        "sabdab"
      ],
      "expert_ids": [
        "david-baker"
      ],
      "group_ids": [
        "ipd-uw"
      ],
      "hosted_by": [],
      "composite_score": 97.0,
      "aliases": []
    },
    {
      "id": "gnnbench-drug",
      "name": "GNNBench-Drug 2026",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "property-prediction",
        "graph-benchmark"
      ],
      "description": "Head-to-head GNN / Transformer / equivariant-network benchmark on 11 drug-relevant tasks with harmonized splits.",
      "size": {
        "molecules": 3800000,
        "tasks": 11
      },
      "primary_paper": {
        "title": "GNNBench-Drug: a unified benchmark for molecular ML",
        "authors": [
          "Schwaller P",
          "et al."
        ],
        "year": 2025,
        "doi": "10.1038/s42256-025-01058-y",
        "citations": 110
      },
      "official_url": "https://gnnbench-drug.org",
      "github_url": "https://github.com/rxn4chemistry/gnnbench-drug",
      "leaderboard_url": "https://gnnbench-drug.org/leaderboard",
      "license": "Apache-2.0",
      "first_release": "2025-08",
      "last_updated": "2026-03",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "experimental_validation": "retrospective",
      "flags": [],
      "notes": "IBM-led; overlaps with MoleculeNet but adds modern splits.",
      "related_benchmarks": [
        "moleculenet",
        "polaris-admet"
      ],
      "expert_ids": [],
      "group_ids": [
        "ibm-rxn"
      ],
      "hosted_by": [],
      "composite_score": 85.6,
      "aliases": []
    },
    {
      "id": "crispr-outcome-bench",
      "name": "CRISPR Outcome Prediction Benchmark",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "gene-therapy"
      ],
      "task_types": [
        "edit-outcome-prediction",
        "off-target-prediction"
      ],
      "description": "Benchmark for CRISPR guide design and edit-outcome prediction across Cas9, Cas12a, prime editing, base editing.",
      "size": {
        "guides": 220000,
        "cell_lines": 17
      },
      "primary_paper": {
        "title": "Benchmarking CRISPR outcome prediction",
        "authors": [
          "Leenay R",
          "Pinello L"
        ],
        "year": 2025,
        "doi": "10.1038/s41551-025-01201-3",
        "citations": 46
      },
      "official_url": "https://crispr-bench.org",
      "github_url": "https://github.com/pinellolab/crispr-bench",
      "leaderboard_url": "https://crispr-bench.org/leaderboard",
      "license": "MIT",
      "first_release": "2025-06",
      "last_updated": "2026-02",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "experimental_validation": "wet-lab-confirmed",
      "flags": [],
      "notes": "Prospective track added in Q1 2026.",
      "related_benchmarks": [],
      "expert_ids": [],
      "group_ids": [
        "pinello-lab"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 79.5,
      "aliases": []
    },
    {
      "id": "cell-line-sensitivity-bench",
      "name": "Cell Line Sensitivity Benchmark (CLSB)",
      "stages": [
        "target-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "sensitivity-prediction"
      ],
      "description": "Cell line drug-sensitivity prediction across GDSC2, CCLE, and PRISM with standardized splits and matched molecular features.",
      "size": {
        "cell_lines": 1850,
        "compounds": 4500
      },
      "primary_paper": {
        "title": "Cell Line Sensitivity Benchmark 2025",
        "authors": [
          "Theodoris CV",
          "et al."
        ],
        "year": 2025,
        "doi": "10.1038/s41587-025-02015-5",
        "citations": 52
      },
      "official_url": "https://clsb.bio",
      "github_url": "https://github.com/clsb-bio/clsb",
      "leaderboard_url": "https://clsb.bio/leaderboard",
      "license": "CC-BY-4.0",
      "first_release": "2025-07",
      "last_updated": "2026-03",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "wet-lab-confirmed",
      "flags": [],
      "notes": "DepMap-adjacent but adds new splits and PRISM v4.",
      "related_benchmarks": [
        "depmap",
        "tdc-drug-syn"
      ],
      "expert_ids": [],
      "group_ids": [
        "broad-institute"
      ],
      "hosted_by": [],
      "composite_score": 88.1,
      "aliases": []
    },
    {
      "id": "protein-language-eval-2026",
      "name": "Protein Language Model Eval 2026",
      "stages": [
        "virtual-cell",
        "hit-id"
      ],
      "modalities": [
        "biologic"
      ],
      "task_types": [
        "zero-shot-fitness",
        "representation-quality"
      ],
      "description": "Consolidated evaluation harness for ESM, ProGen, xTrimoPGLM, Evo \u2014 37 zero-shot tasks spanning fitness, structure, function.",
      "size": {
        "tasks": 37,
        "proteins": 9800000
      },
      "primary_paper": {
        "title": "Protein LM Eval 2026: a community harness",
        "authors": [
          "Rives A",
          "Rao R",
          "et al."
        ],
        "year": 2026,
        "doi": "10.1101/2026.02.10.550123",
        "citations": 76
      },
      "official_url": "https://plmeval.org",
      "github_url": "https://github.com/facebookresearch/plm-eval",
      "leaderboard_url": "https://plmeval.org/leaderboard",
      "huggingface_url": "https://huggingface.co/datasets/facebook/plm-eval",
      "license": "MIT",
      "first_release": "2026-02",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "wet-lab-confirmed",
      "flags": [],
      "notes": "Meta FAIR + EvolutionaryScale collaboration; includes held-out targets with wet-lab fitness.",
      "related_benchmarks": [
        "proteingym",
        "tape",
        "peer",
        "flip"
      ],
      "expert_ids": [],
      "group_ids": [
        "meta-fair",
        "evolutionaryscale"
      ],
      "hosted_by": [],
      "composite_score": 100.0,
      "aliases": []
    },
    {
      "id": "dmpk-integrated-bench",
      "name": "DMPK Integrated Benchmark",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "multi-objective-optimization",
        "pk-projection"
      ],
      "description": "Integrated DMPK benchmark evaluating multi-objective optimization jointly on solubility, metabolic stability, hERG, PPB, and rat PK.",
      "size": {
        "molecules": 45000
      },
      "primary_paper": {
        "title": "Integrated DMPK benchmarking 2025",
        "authors": [
          "Ghose AK",
          "et al."
        ],
        "year": 2025,
        "doi": "10.1021/acs.jcim.5c00342",
        "citations": 67
      },
      "official_url": "https://dmpk-bench.org",
      "github_url": "https://github.com/dmpk-bench/dmpk-bench",
      "leaderboard_url": "https://dmpk-bench.org/leaderboard",
      "license": "CC-BY-NC-4.0",
      "first_release": "2025-09",
      "last_updated": "2026-02",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "experimental_validation": "retrospective",
      "flags": [],
      "notes": "AZ/Merck/Pfizer contributed held-out test molecules.",
      "related_benchmarks": [
        "tdc-admet",
        "polaris-admet"
      ],
      "expert_ids": [],
      "group_ids": [
        "az-ai",
        "merck-ai",
        "pfizer-ai"
      ],
      "hosted_by": [],
      "composite_score": 82.5,
      "aliases": []
    },
    {
      "id": "clinbench-q2-2026",
      "name": "ClinBench Quarterly \u2014 Q2 2026",
      "stages": [
        "phase-ii",
        "phase-iii",
        "clinical-development"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "trial-outcome-prediction",
        "endpoint-modeling"
      ],
      "description": "Rolling quarterly clinical trial benchmark: eligibility extraction, outcome prediction, endpoint adjudication on Q2 2026 trial snapshot.",
      "size": {
        "trials": 4120,
        "endpoints": 11800
      },
      "primary_paper": {
        "title": "ClinBench: a rolling clinical trial benchmark",
        "authors": [
          "Insilico Clinical AI Team"
        ],
        "year": 2026,
        "doi": "10.1038/s41746-026-01045-2",
        "citations": 31
      },
      "official_url": "https://ddb.insilico.com/?category=Clinical+Trials",
      "github_url": "N/A",
      "leaderboard_url": "https://ddb.insilico.com/?category=Clinical+Trials",
      "license": "CC-BY-4.0",
      "first_release": "2025-10",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "clinical",
      "flags": [],
      "notes": "New track in Q2 2026 for endpoint adjudication.",
      "related_benchmarks": [
        "clinbench-quarterly",
        "hint-trialbench",
        "top-benchmark"
      ],
      "expert_ids": [],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-ddb"
      ],
      "composite_score": 87.6,
      "aliases": []
    },
    {
      "id": "longevity-compound-bench",
      "name": "Longevity Compound Benchmark",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "geroprotective-activity",
        "lifespan-extension-prediction"
      ],
      "description": "Benchmark for predicting geroprotective activity from compound structure and cellular signatures. Curated against DrugAge + published lifespan extension studies.",
      "size": {
        "compounds": 1820
      },
      "primary_paper": {
        "title": "Longevity Compound Benchmark 2026",
        "authors": [
          "Aliper A",
          "Zhavoronkov A"
        ],
        "year": 2026,
        "doi": "10.1038/s43587-026-00401-x",
        "citations": 24
      },
      "official_url": "https://scienceaibench.insilico.com/?category=Biology&suite=Longevity",
      "github_url": "N/A",
      "leaderboard_url": "https://scienceaibench.insilico.com/?benchmark=bio-longevity-benchmark",
      "license": "CC-BY-4.0",
      "first_release": "2026-02",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "experimental_validation": "wet-lab-confirmed",
      "flags": [],
      "notes": "Insilico-hosted; unique in bridging cheminformatics and aging biology.",
      "related_benchmarks": [
        "longevity-bench-insilico"
      ],
      "expert_ids": [],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench"
      ],
      "composite_score": 84.6,
      "aliases": []
    },
    {
      "id": "openbind-ev-a71",
      "name": "OpenBind EV-A71 Structure-Affinity Dataset",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "protein_structure",
        "small-molecule"
      ],
      "task_types": [
        "binding_affinity",
        "pose_prediction",
        "virtual_screening"
      ],
      "description": "First open dataset from the OpenBind consortium (Oxford/Diamond Light Source). Dense structure-affinity dataset for Enterovirus A71 2A protease: 925 crystallographic binding events from 699 compounds with affinity measurements for 601 compounds. Designed for benchmarking structure-based AI docking, cofolding, and affinity prediction.",
      "size": {
        "crystal_structures": 925,
        "affinity_measurements": 601
      },
      "primary_paper": {
        "title": "OpenBind's first release: a structure-affinity dataset for structure-based AI",
        "doi": null,
        "year": 2026,
        "citations": 0
      },
      "official_url": "https://openbind.uk/",
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": "https://zenodo.org/communities/openbind",
      "paper_url": "https://openbind.uk/news/blog-openbinds-first-release-a-structure-affinity-dataset-for-structure-based-ai/",
      "license": "CC-BY-4.0",
      "first_release": "2026-05",
      "last_updated": "2026-05",
      "rubric": {
        "rigor": 5,
        "coverage": 2,
        "maintenance": 5,
        "adoption": 3,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 84.8,
      "flags": [
        "experimental_validation",
        "crystallography"
      ],
      "notes": "One of the largest public single-target structure-affinity datasets. High-throughput crystallography at Diamond Light Source. Plans for more targets and blind challenges.",
      "related_benchmarks": [
        "casf-2016",
        "pdbbind",
        "posebusters"
      ],
      "experimental_validation": "wet-lab-confirmed",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "do-challenge-2025",
      "name": "DO Challenge 2025 (DeepOrigin Autonomous Drug Discovery)",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "ai_agent",
        "small-molecule"
      ],
      "task_types": [
        "virtual_screening",
        "active_learning",
        "agent_evaluation"
      ],
      "description": "Benchmark for autonomous AI agents in drug discovery. Agents must identify top 1,000 molecules from 1M conformations with limited budget (100K score queries). Tests ML-based sampling, strategic resource management, and code execution for autonomous discovery pipelines.",
      "size": {
        "molecular_conformations": 1000000,
        "query_budget": 100000
      },
      "primary_paper": {
        "title": "Can AI Agents Design and Implement Drug Discovery Pipelines?",
        "doi": "10.5281/zenodo.15296510",
        "year": 2025,
        "citations": 5
      },
      "official_url": "https://deeporigin.com/events/deep-origin-challenge-2025/",
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": "https://zenodo.org/records/15296510",
      "paper_url": "https://deeporigin.com/blog/benchmarking-and-development-of-ai-based-agentic-systems-for-autonomous-drug-discovery/",
      "license": "Apache-2.0",
      "first_release": "2025-03",
      "last_updated": "2025-05",
      "rubric": {
        "rigor": 4,
        "coverage": 2,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "composite_score": 70.9,
      "flags": [
        "competition",
        "agent_benchmark"
      ],
      "notes": "First benchmark specifically for AI agents (not just models) in drug discovery. Multi-agent system 'Deep Thought' outperformed most human teams but underperformed expert solutions. Tests integrated pipeline design rather than isolated tasks.",
      "related_benchmarks": [],
      "experimental_validation": "none",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "x-atlas-orion",
      "name": "X-Atlas/Orion (Xaira Genome-wide Perturb-seq)",
      "stages": [
        "target-id"
      ],
      "modalities": [
        "genetic_perturbation",
        "single_cell"
      ],
      "task_types": [
        "perturbation_prediction",
        "gene_function",
        "foundation_model_training"
      ],
      "description": "Largest public genome-wide Perturb-seq dataset at release (June 2025). 8 million cells with >16,000 UMIs/cell (~10x deeper sequencing than other atlases). Features dose-dependent genetic effect detection via Xaira's FiCS platform. Designed for training biological foundation models.",
      "size": {
        "cells": 8000000,
        "description": "genome-wide perturbations"
      },
      "primary_paper": {
        "title": "X-Atlas/Orion: Genome-wide Perturb-seq Datasets via a Scalable Fix-Cryopreserve Platform for Training Dose-Dependent Biological Foundation Models",
        "doi": "10.1101/2025.06.11.659105",
        "year": 2025,
        "citations": 25
      },
      "official_url": "https://www.xaira.com/",
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": "https://huggingface.co/datasets/Xaira-Therapeutics/X-Atlas-Orion",
      "paper_url": "https://www.biorxiv.org/content/10.1101/2025.06.11.659105v1",
      "license": "CC-BY-4.0",
      "first_release": "2025-06",
      "last_updated": "2025-06",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 91.4,
      "flags": [
        "foundation_model_data",
        "industry_generated"
      ],
      "notes": "Generated by Xaira Therapeutics. Unprecedented sequencing depth enables detection of subtle perturbation effects. Superseded in scale by X-Atlas/Pisces (25.6M cells, March 2026).",
      "related_benchmarks": [
        "scperturb",
        "openproblems-perturbation",
        "x-atlas-pisces"
      ],
      "experimental_validation": "wet-lab-confirmed",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "x-atlas-pisces",
      "name": "X-Atlas/Pisces (25.6M Cell Multi-Context Perturb-seq)",
      "stages": [
        "target-id"
      ],
      "modalities": [
        "genetic_perturbation",
        "single_cell"
      ],
      "task_types": [
        "perturbation_prediction",
        "context_transfer",
        "foundation_model_training"
      ],
      "description": "Largest Perturb-seq dataset ever released: 25.6 million perturbed single-cell transcriptomes across 16 diverse biological contexts. Trained Xaira's X-Cell virtual cell model. Enables cross-context perturbation prediction and cell-type transfer learning.",
      "size": {
        "cells": 25600000,
        "biological_contexts": 16
      },
      "primary_paper": {
        "title": "X-Cell: A Virtual Cell Model Trained on X-Atlas/Pisces",
        "doi": null,
        "year": 2026,
        "citations": 10
      },
      "official_url": "https://www.xaira.com/",
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": "https://huggingface.co/datasets/Xaira-Therapeutics/X-Atlas-Pisces",
      "paper_url": "https://www.cdn.xaira.com/papers/X_CELL_V1_0316_final.pdf",
      "license": "CC-BY-4.0",
      "first_release": "2026-03",
      "last_updated": "2026-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 94.4,
      "flags": [
        "foundation_model_data",
        "industry_generated",
        "largest_in_class"
      ],
      "notes": "Successor to X-Atlas/Orion. 16 diverse biological contexts enable robust cross-context generalization. Underlies X-Cell foundation model. Industry-scale data release.",
      "related_benchmarks": [
        "x-atlas-orion",
        "scperturb",
        "openproblems-perturbation"
      ],
      "experimental_validation": "wet-lab-confirmed",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "flab2",
      "name": "FLAb2 (Fitness Landscape for Antibodies 2)",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "antibody",
        "protein_sequence"
      ],
      "task_types": [
        "property_prediction",
        "developability",
        "fitness_prediction"
      ],
      "description": "Most extensive public benchmark for therapeutic antibody design. Developability assay data for >4 million antibodies from 32 studies covering 7 properties: thermostability, expression, aggregation, binding affinity, pharmacokinetics, polyreactivity, and immunogenicity.",
      "size": {
        "antibodies": 4000000,
        "studies": 32,
        "properties": 7
      },
      "primary_paper": {
        "title": "Fitness Landscape for Antibodies 2: Benchmarking Reveals That Protein AI Models Cannot Yet Consistently Predict Developability Properties",
        "doi": "10.64898/2025.12.27.696706",
        "year": 2025,
        "citations": 8
      },
      "official_url": "https://github.com/Graylab/FLAb",
      "github_url": "https://github.com/Graylab/FLAb",
      "leaderboard_url": null,
      "dataset_url": "https://registry.opendata.aws/flab/",
      "paper_url": "https://www.biorxiv.org/content/10.64898/2025.12.27.696706v1",
      "license": "MIT",
      "first_release": "2025-12",
      "last_updated": "2025-12",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 91.9,
      "flags": [
        "biologics",
        "multi_property"
      ],
      "notes": "Key finding: current protein AI models cannot consistently predict antibody developability. Critical for biologics pipeline. Covers therapeutically relevant properties beyond just binding affinity.",
      "related_benchmarks": [
        "sabdab",
        "therapeutic-antibody-bench-2026",
        "iglm-bench"
      ],
      "experimental_validation": "wet-lab-confirmed",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "nucleobench",
      "name": "NucleoBench",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "dna",
        "rna",
        "nucleic_acid"
      ],
      "task_types": [
        "sequence_design",
        "algorithm_comparison"
      ],
      "description": "Open-source benchmark for nucleic acid sequence design algorithms. Evaluates 9 design algorithms across 16 diverse biological tasks (gene expression control, transcription factor binding). Over 400,000 experiments. Led to development of AdaBeam hybrid algorithm.",
      "size": {
        "tasks": 16,
        "algorithms": 9,
        "experiments": 400000
      },
      "primary_paper": {
        "title": "NucleoBench: A Large-Scale Benchmark of Neural Nucleic Acid Design Algorithms",
        "doi": "10.1101/2025.06.20.660785",
        "year": 2025,
        "citations": 12
      },
      "official_url": "https://research.google/blog/smarter-nucleic-acid-design-with-nucleobench-and-adabeam/",
      "github_url": "https://github.com/move37-labs/nucleobench",
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://www.biorxiv.org/content/10.1101/2025.06.20.660785v3",
      "license": "Apache-2.0",
      "first_release": "2025-06",
      "last_updated": "2025-09",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "composite_score": 88.9,
      "flags": [
        "gene_therapy",
        "rna_therapeutics",
        "google_research"
      ],
      "notes": "First comprehensive benchmark for nucleic acid design. Google Research + Move37 Labs collaboration. Critical for CRISPR therapies, mRNA vaccines, and gene therapy design. AdaBeam outperforms on 11/16 tasks.",
      "related_benchmarks": [
        "mrna-design-bench"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "biodesignbench",
      "name": "BioDesignBench",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "protein_structure",
        "ai_agent",
        "protein_sequence"
      ],
      "task_types": [
        "protein_design",
        "agent_evaluation",
        "tool_use"
      ],
      "description": "Benchmark for evaluating LLM agents in protein design. 76 expert-curated tasks covering antibodies, enzymes, fluorescent proteins, binders, and scaffolds. Integrates AlphaFold, RFdiffusion, ProteinMPNN, Rosetta. Measures tool-use behavior and design quality.",
      "size": {
        "expert-curated_tasks": 76,
        "protein_categories": 5
      },
      "primary_paper": {
        "title": "BioDesignBench: Benchmarking LLM Agents for Protein Design",
        "doi": "10.64898/2026.05.06.723381",
        "year": 2026,
        "citations": 0
      },
      "official_url": null,
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://www.biorxiv.org/content/10.64898/2026.05.06.723381v1",
      "license": "Unknown",
      "first_release": "2026-05",
      "last_updated": "2026-05",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 2,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 5
      },
      "composite_score": 77.7,
      "flags": [
        "agent_benchmark",
        "protein_engineering"
      ],
      "notes": "Key finding: LLM agents select appropriate tools but evaluate designs superficially, rarely comparing alternatives. Strongest agents surpass hardcoded pipelines but underperform human experts. Enforcing deeper evaluation substantially improves performance.",
      "related_benchmarks": [
        "protein-design-bench-2026",
        "proteingym"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "pepadmet",
      "name": "pepADMET",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "peptide",
        "small-molecule"
      ],
      "task_types": [
        "admet_prediction",
        "property_prediction"
      ],
      "description": "First comprehensive AI platform for peptide ADMET prediction. 36,643 high-quality data entries from open databases, covering 19-29 ADMET endpoints. Supports linear, cyclic, natural, and modified peptides across multiple species.",
      "size": {
        "entries": 36643,
        "datasets": 26,
        "endpoints_min": 19,
        "endpoints_max": 29
      },
      "primary_paper": {
        "title": "pepADMET: A Comprehensive Platform for Peptide ADMET Prediction",
        "doi": "10.1021/acs.jcim.5c02518",
        "year": 2026,
        "citations": 3
      },
      "official_url": "https://ddai.tech/pepADMET",
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://pubs.acs.org/doi/10.1021/acs.jcim.5c02518",
      "license": "Academic",
      "first_release": "2026-01",
      "last_updated": "2026-01",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "composite_score": 80.0,
      "flags": [
        "biologics",
        "peptide_therapeutics",
        "first_in_class"
      ],
      "notes": "Fills critical gap in peptide ADMET prediction (previous tools focused on small molecules). Covers Caco-2, PAMPA, BBB, half-life, toxicity. Supports modified peptides unlike most tools. Chinese research group.",
      "related_benchmarks": [
        "ism-admet",
        "polaris-admet"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "molgenbench",
      "name": "MolGenBench",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "molecular_generation",
        "lead_optimization",
        "de_novo_design"
      ],
      "description": "Comprehensive benchmark for molecular generation in real-world drug discovery, specifically addressing hit-to-lead (H2L) optimization. 220,005 experimentally confirmed active molecules across 120 targets and 5,433 chemical series. Novel pharmaceutically grounded metrics.",
      "size": {
        "molecules": 220005,
        "targets": 120,
        "series": 5433
      },
      "primary_paper": {
        "title": "Benchmarking Real-World Applicability of Molecular Generative Models from De novo Design to Lead Optimization with MolGenBench",
        "doi": "10.1101/2025.11.03.686215",
        "year": 2025,
        "citations": 6
      },
      "official_url": null,
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://www.biorxiv.org/content/10.1101/2025.11.03.686215v2",
      "license": "Unknown",
      "first_release": "2025-11",
      "last_updated": "2025-11",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 3,
        "industry_relevance": 5
      },
      "composite_score": 78.2,
      "flags": [
        "hit_to_lead",
        "real_world_metrics"
      ],
      "notes": "Reveals significant gap between current generative model capabilities and real-world H2L demands. Novel metrics for target-specific active compound rediscovery and progressive potency optimization. Large experimentally confirmed dataset.",
      "related_benchmarks": [
        "guacamol",
        "moses",
        "pmo"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "ct-open",
      "name": "CT-Open (Live Clinical Trial Outcome Benchmark)",
      "stages": [
        "phase-iii",
        "phase-i",
        "phase-ii"
      ],
      "modalities": [
        "small-molecule",
        "text",
        "clinical_data"
      ],
      "task_types": [
        "clinical_trial_prediction",
        "outcome_forecasting"
      ],
      "description": "Open-access, live, leakage-free platform for benchmarking clinical trial outcome prediction. Issues new time-stamped benchmarks quarterly via automated expert-validated pipelines. LLM-powered decontamination ensures outcomes weren't public at prediction time.",
      "size": {
        "description": "growing dataset"
      },
      "primary_paper": {
        "title": "CT-Open: A Live Clinical Trial Outcome Benchmark Without Leakage",
        "doi": null,
        "year": 2026,
        "citations": 2
      },
      "official_url": "https://ct-open.net/",
      "github_url": null,
      "leaderboard_url": "https://ct-open.net/",
      "dataset_url": null,
      "paper_url": "https://arxiv.org/abs/2604.16742",
      "license": "CC-BY-4.0",
      "first_release": "2026-04",
      "last_updated": "2026-05",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 5,
        "adoption": 2,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 84.8,
      "flags": [
        "live_benchmark",
        "leakage_free",
        "clinical"
      ],
      "notes": "Presented at ICLR 2026. Addresses critical data contamination problem in clinical trial prediction. Quarterly cadence: Winter/Spring/Summer/Fall challenges. Fully automated decontamination pipeline using iterative LLM web searches.",
      "related_benchmarks": [
        "clinbench-quarterly",
        "hint-trialbench",
        "ctod"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "scperturbench",
      "name": "scPerturBench",
      "stages": [
        "target-id"
      ],
      "modalities": [
        "chemical_perturbation",
        "genetic_perturbation",
        "single_cell"
      ],
      "task_types": [
        "perturbation_prediction",
        "generalization",
        "method_comparison"
      ],
      "description": "Comprehensive benchmark of 27 single-cell perturbation response prediction methods across 29 datasets. Evaluates generalization to unseen perturbations, combinatorial interactions, and cross-cell-type transfer. Published in Nature Methods.",
      "size": {
        "methods": 27,
        "datasets": 29
      },
      "primary_paper": {
        "title": "Benchmarking algorithms for generalizable single-cell perturbation response prediction",
        "doi": "10.1038/s41592-025-02980-0",
        "year": 2025,
        "citations": 15
      },
      "official_url": "https://bm2-lab.github.io/scPerturBench-reproducibility/",
      "github_url": "https://github.com/bm2-lab/scPerturBench",
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://www.biorxiv.org/content/10.1101/2024.12.23.630036v1",
      "license": "MIT",
      "first_release": "2025-12",
      "last_updated": "2025-12",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "composite_score": 91.9,
      "flags": [
        "nature_methods",
        "comprehensive_comparison"
      ],
      "notes": "Published in Nature Methods (Vol 23, Issue 2). Most comprehensive evaluation of perturbation prediction methods. Covers both genetic and chemical perturbations. Web interface for exploring results. Chinese research group (Tongji University).",
      "related_benchmarks": [
        "perturbbench",
        "openproblems-perturbation",
        "x-atlas-orion"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "assaybench",
      "name": "AssayBench",
      "stages": [
        "target-id",
        "hit-id"
      ],
      "modalities": [
        "text",
        "genetic_perturbation",
        "single_cell"
      ],
      "task_types": [
        "phenotypic_screening",
        "gene_ranking",
        "llm_evaluation"
      ],
      "description": "Assay-level virtual cell benchmark for LLMs and agents. 1,920 CRISPR screens from BioGRID ORCS. Models predict gene rankings modulating phenotypes from free-text experimental descriptions. Tests LLM ability to perform in silico phenotypic screens.",
      "size": {
        "CRISPR_screens": 1920
      },
      "primary_paper": {
        "title": "AssayBench: An Assay-Level Virtual Cell Benchmark for LLMs and Agents",
        "doi": null,
        "year": 2026,
        "citations": 0
      },
      "official_url": null,
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://arxiv.org/abs/2605.10876",
      "license": "Unknown",
      "first_release": "2026-05",
      "last_updated": "2026-05",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 2,
        "quality": 4,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "composite_score": 63.3,
      "flags": [
        "virtual_cell",
        "llm_benchmark",
        "phenotypic"
      ],
      "notes": "Novel framing: gene rank prediction from natural language experiment descriptions. Part of broader virtual cell revolution. Tests whether LLMs can replace actual CRISPR screens for hypothesis generation.",
      "related_benchmarks": [
        "virtualcellbench-2026",
        "cz-virtual-cell-challenge"
      ],
      "experimental_validation": "none",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "lsd-docking-6b",
      "name": "LSD Large-Scale Docking Database",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein_structure",
        "small-molecule"
      ],
      "task_types": [
        "virtual_screening",
        "docking",
        "scoring"
      ],
      "description": "Open-source dataset of 6.3 billion explicitly evaluated ligand-target docking pairs across 11 protein targets. Provides docking scores, SMILES, poses for top molecules, and in vitro validation results. Designed for ML model development and chemical space exploration.",
      "size": {
        "ligand-target_pairs": 6300000000,
        "targets": 11
      },
      "primary_paper": {
        "title": "A database for large-scale docking and experimental results",
        "doi": "10.1021/acs.jcim.5c00394",
        "year": 2025,
        "citations": 8
      },
      "official_url": "https://lsd.docking.org/",
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": "https://lsd.docking.org/",
      "paper_url": "https://pubs.acs.org/doi/abs/10.1021/acs.jcim.5c00394",
      "license": "CC-BY-4.0",
      "first_release": "2025-02",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 82.5,
      "flags": [
        "ultra_large_scale",
        "experimental_validation"
      ],
      "notes": "Unprecedented scale for public docking data. Includes experimental in vitro validation for subset. From UCSF Shoichet Lab. Critical for training ML scoring functions and active learning in virtual screening.",
      "related_benchmarks": [
        "dockstring",
        "lit-pcba",
        "dude"
      ],
      "experimental_validation": "wet-lab-confirmed",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "fgbench",
      "name": "FGBench (Functional Group Molecular Property Reasoning)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "text",
        "small-molecule"
      ],
      "task_types": [
        "property_prediction",
        "llm_evaluation",
        "interpretability"
      ],
      "description": "Benchmark for molecular property reasoning at functional group level in LLMs. 625,000 problems enriched with FG annotations and positional info. 245 functional groups with regression and classification tasks. Tests structure-property understanding.",
      "size": {
        "problems": 625000,
        "functional_groups": 245
      },
      "primary_paper": {
        "title": "FGBench: A Dataset and Benchmark for Molecular Property Reasoning at Functional Group-Level in Large Language Models",
        "doi": null,
        "year": 2025,
        "citations": 5
      },
      "official_url": null,
      "github_url": "https://github.com/xuanliugit/FGBench",
      "leaderboard_url": null,
      "dataset_url": "https://github.com/xuanliugit/FGBench",
      "paper_url": "https://arxiv.org/abs/2508.01055",
      "license": "MIT",
      "first_release": "2025-08",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "composite_score": 77.0,
      "flags": [
        "neurips_2025",
        "llm_benchmark",
        "interpretability"
      ],
      "notes": "NeurIPS 2025 Datasets & Benchmarks Track. Reveals LLMs struggle with FG-level property reasoning. Addresses gap between molecular-level and substructure-level understanding. Framework for generating new chemistry QA pairs.",
      "related_benchmarks": [
        "moleculenet",
        "molecule-ace"
      ],
      "experimental_validation": "none",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "boom-ood",
      "name": "BOOM (Benchmarking Out-Of-Distribution Molecular Predictions)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "property_prediction",
        "ood_generalization"
      ],
      "description": "Systematic benchmark evaluating OOD performance for molecular property prediction. Over 150 model-task combinations evaluated. Key finding: no model consistently achieves strong OOD generalization; average OOD error 3x larger than in-distribution.",
      "size": {
        "model-task_combinations": 150
      },
      "primary_paper": {
        "title": "BOOM: Benchmarking Out-Of-distribution Molecular Property Predictions of Machine Learning Models",
        "doi": null,
        "year": 2025,
        "citations": 7
      },
      "official_url": null,
      "github_url": "https://github.com/BOOM-benchmark",
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://arxiv.org/abs/2505.01912",
      "license": "MIT",
      "first_release": "2025-05",
      "last_updated": "2025-12",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 83.3,
      "flags": [
        "neurips_2025",
        "ood_evaluation",
        "critical_finding"
      ],
      "notes": "NeurIPS 2025. Critical benchmark showing the gap between in-distribution and OOD performance in molecular ML. Highly relevant for real-world drug discovery where novel chemotypes are the goal. Frontier challenge for chemical ML.",
      "related_benchmarks": [
        "moleculenet",
        "molecule-ace",
        "tdc-admet"
      ],
      "experimental_validation": "none",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "pdfbench",
      "name": "PDFBench (De Novo Protein Design from Function)",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein_structure",
        "text",
        "protein_sequence"
      ],
      "task_types": [
        "protein_design",
        "text_guided_generation"
      ],
      "description": "Unified benchmark for de novo protein design from functional descriptions. Supports description-guided and keyword-guided design tasks. 16-22 metrics across plausibility, foldability, language alignment, similarity, novelty, and diversity.",
      "size": {
        "description": "Multiple evaluation tasks",
        "metrics_min": 16,
        "metrics_max": 22
      },
      "primary_paper": {
        "title": "PDFBench: A Benchmark for De novo Protein Design from Function",
        "doi": null,
        "year": 2025,
        "citations": 3
      },
      "official_url": "https://pdfbench.github.io/",
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://arxiv.org/abs/2505.20346",
      "license": "Unknown",
      "first_release": "2025-05",
      "last_updated": "2025-05",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 2,
        "adoption": 2,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "composite_score": 68.9,
      "flags": [
        "text_guided_design",
        "multi_metric"
      ],
      "notes": "First unified benchmark for function-guided protein generation. Addresses comparison challenges from proprietary datasets and inconsistent metrics. Inter-metric correlation analysis provides guidelines for metric selection.",
      "related_benchmarks": [
        "protein-design-bench-2026",
        "proteingym",
        "biodesignbench"
      ],
      "experimental_validation": "none",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "benchbb",
      "name": "BenchBB (Bench-tested Binder Benchmark)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "protein_structure",
        "protein_sequence"
      ],
      "task_types": [
        "protein_design",
        "binding_prediction",
        "experimental_validation"
      ],
      "description": "Curated set of 7 standardized protein targets for benchmarking computational binder design methods with wet-lab validation. From Adaptyv Bio. Provides consistent antigen targets for fair cross-method comparison with experimental testing infrastructure.",
      "size": {
        "antigen_targets": 7,
        "description": "standardized assays"
      },
      "primary_paper": {
        "title": "Crowdsourced Protein Design: Lessons From the Adaptyv EGFR Binder Competition",
        "doi": "10.1101/2025.04.17.648362",
        "year": 2025,
        "citations": 12
      },
      "official_url": "https://www.adaptyvbio.com/blog/benchbb/",
      "github_url": null,
      "leaderboard_url": "https://www.adaptyvbio.com/",
      "dataset_url": null,
      "paper_url": "https://www.biorxiv.org/content/10.1101/2025.04.17.648362v2",
      "license": "Open",
      "first_release": "2025-04",
      "last_updated": "2025-10",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "composite_score": 88.4,
      "flags": [
        "experimental_validation",
        "competition",
        "wet_lab"
      ],
      "notes": "Unique in providing actual wet-lab validation infrastructure. Adaptyv runs cloud lab for protein designers. EGFR competition attracted diverse computational methods. Boolean Biotech VHH Competition 2025 adopted BenchBB targets. Drives real experimental iteration.",
      "related_benchmarks": [
        "protein-design-bench-2026",
        "biodesignbench"
      ],
      "experimental_validation": "wet-lab-confirmed",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "cafa6",
      "name": "CAFA 6 (Critical Assessment of Function Annotation 6)",
      "stages": [
        "target-id"
      ],
      "modalities": [
        "protein_sequence"
      ],
      "task_types": [
        "function_prediction",
        "gene_ontology"
      ],
      "description": "6th iteration of the CAFA community challenge for protein function prediction on Kaggle. Predicts Gene Ontology terms (molecular function, biological process, cellular component) from amino acid sequences. Evaluated against experimentally validated annotations accumulated post-deadline.",
      "size": {
        "description": "GO annotations"
      },
      "primary_paper": {
        "title": "CAFA 6: Critical Assessment of Functional Annotation",
        "doi": null,
        "year": 2025,
        "citations": 5
      },
      "official_url": "https://www.kaggle.com/competitions/cafa-6-protein-function-prediction",
      "github_url": null,
      "leaderboard_url": "https://www.kaggle.com/competitions/cafa-6-protein-function-prediction/leaderboard",
      "dataset_url": "https://www.kaggle.com/competitions/cafa-6-protein-function-prediction/data",
      "paper_url": null,
      "license": "Competition",
      "first_release": "2025-01",
      "last_updated": "2026-05",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "composite_score": 97.5,
      "flags": [
        "competition",
        "kaggle",
        "time_delayed_evaluation"
      ],
      "notes": "Continuation of CAFA series (since 2010). Time-delayed evaluation prevents data leakage. Final evaluation May 2026 using annotations from UniProt Dec 2025/Jan 2026. Second iteration hosted on Kaggle.",
      "related_benchmarks": [
        "cafa5",
        "proteingym"
      ],
      "experimental_validation": "wet-lab-confirmed",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "posex",
      "name": "PoseX (Protein-Ligand Docking Benchmark)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "protein_structure",
        "small-molecule"
      ],
      "task_types": [
        "docking",
        "pose_prediction"
      ],
      "description": "Open-source benchmark for protein-ligand docking evaluating 23 methods. 718 self-docking and 1,312 cross-docking entries from PDB structures 2022-2025 (prevents data leakage). Compares physics-based, AI docking, and AI co-folding approaches.",
      "size": {
        "self-dock_+_1": 718,
        "cross-dock_entries": 312,
        "methods": 23
      },
      "primary_paper": {
        "title": "PoseX: A Comprehensive Benchmark for Protein-Ligand Docking",
        "doi": null,
        "year": 2025,
        "citations": 10
      },
      "official_url": "http://dock-lab.tech/",
      "github_url": "https://github.com/cataai/posex",
      "leaderboard_url": "http://dock-lab.tech/",
      "dataset_url": "https://github.com/cataai/posex",
      "paper_url": "https://arxiv.org/abs/2505.01700",
      "license": "MIT",
      "first_release": "2025-05",
      "last_updated": "2025-12",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 91.4,
      "flags": [
        "leakage_prevention",
        "cross_docking",
        "leaderboard"
      ],
      "notes": "Key findings: AI surpasses physics-based docking overall; relaxation crucial for AI-generated poses; pocket specification boosts performance; some co-folding methods struggle with chirality. Uses only 2022-2025 PDB structures to prevent train/test contamination.",
      "related_benchmarks": [
        "posebusters",
        "casf-2016",
        "pdbbind"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "vsds-vd",
      "name": "VSDS-vd (Virtual Screening Decoy Set for Docking)",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein_structure",
        "small-molecule"
      ],
      "task_types": [
        "virtual_screening",
        "docking",
        "method_comparison"
      ],
      "description": "Benchmark for comparing AI-powered and physics-based docking tools from virtual screening perspective. Evaluates 4 AI docking tools, 4 physics-based tools, and 2 AI rescoring methods. From Zhejiang University. Proposes hierarchical screening strategy.",
      "size": {
        "description": "Multiple target-specific decoy sets"
      },
      "primary_paper": {
        "title": "Benchmarking AI-powered docking methods from the perspective of virtual screening",
        "doi": null,
        "year": 2025,
        "citations": 6
      },
      "official_url": null,
      "github_url": "https://github.com/shukai1997/VSDS-VD",
      "leaderboard_url": null,
      "dataset_url": "https://github.com/shukai1997/VSDS-VD",
      "paper_url": null,
      "license": "Open",
      "first_release": "2025-02",
      "last_updated": "2025-02",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 2,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "composite_score": 65.8,
      "flags": [
        "chinese_benchmark",
        "virtual_screening"
      ],
      "notes": "Chinese research benchmark (Zhejiang University). Finds AI methods show deficiencies in physical soundness of docked structures despite good VS performance. Proposes hierarchical strategy balancing speed and accuracy.",
      "related_benchmarks": [
        "dude",
        "dekois",
        "lit-pcba",
        "posex"
      ],
      "experimental_validation": "none",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "abbibench",
      "name": "AbBiBench (Antibody Binding Benchmark)",
      "stages": [
        "lead-id-admet"
      ],
      "modalities": [
        "protein_structure",
        "antibody"
      ],
      "task_types": [
        "binding_affinity",
        "affinity_maturation",
        "protein_design"
      ],
      "description": "Benchmark for antibody binding affinity maturation treating Ab-Ag complex as fundamental unit. 184,500+ experimental measurements across 9 antigen targets (influenza, SARS-CoV-2, HER2, VEGF). Compares language models, inverse folding, diffusion, and geometric models.",
      "size": {
        "entries": 184,
        "measurements": 500,
        "antigens": 9
      },
      "primary_paper": {
        "title": "AbBiBench: A Benchmark for Antibody Binding Affinity Maturation and Design",
        "doi": null,
        "year": 2025,
        "citations": 4
      },
      "official_url": null,
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://arxiv.org/abs/2506.04235",
      "license": "Unknown",
      "first_release": "2025-05",
      "last_updated": "2025-10",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "composite_score": 80.8,
      "flags": [
        "biologics",
        "affinity_maturation"
      ],
      "notes": "Key finding: structure-conditioned inverse folding models outperform others for affinity prediction and generation. Treats complex holistically rather than antibody in isolation. Demonstrates utility in generative design for H1N1 antibody improvement.",
      "related_benchmarks": [
        "flab2",
        "sabdab",
        "therapeutic-antibody-bench-2026"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "dmpkbench-llm",
      "name": "DMPKBench (DMPK LLM Evaluation Benchmark)",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "text",
        "tabular",
        "small-molecule"
      ],
      "task_types": [
        "llm_evaluation",
        "pk_prediction",
        "admet_prediction"
      ],
      "description": "Multi-modal benchmark for evaluating LLMs and agents in drug metabolism and pharmacokinetics. 120,000+ QA pairs covering experimental design, ADMET optimization, PK modeling, and preclinical-to-clinical translation. Includes SMILES, data tables, PK curves.",
      "size": {
        "QA_pairs": 120000,
        "competency_areas": 5
      },
      "primary_paper": {
        "title": "DMPKBench: A Comprehensive Multi-Modal Benchmark for DMPK LLM Evaluation",
        "doi": null,
        "year": 2025,
        "citations": 4
      },
      "official_url": null,
      "github_url": "https://github.com/GHDDI-AILab/DMPKBench",
      "leaderboard_url": null,
      "dataset_url": "https://github.com/GHDDI-AILab/DMPKBench",
      "paper_url": "https://openreview.net/forum?id=1NSnXVTxNR",
      "license": "Academic",
      "first_release": "2025-09",
      "last_updated": "2025-12",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "composite_score": 83.8,
      "flags": [
        "llm_benchmark",
        "multi_modal",
        "chinese_benchmark"
      ],
      "notes": "From GHDDI (Gates Foundation China). LLM accuracy ranges 11-89% across tasks. Models excel at knowledge tasks but struggle with multi-modal reasoning (PK curves, data tables). Critical for evaluating LLM utility in pharma DMPK departments.",
      "related_benchmarks": [
        "dmpk-integrated-bench",
        "ism-admet"
      ],
      "experimental_validation": "none",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "belka-del",
      "name": "BELKA (Big Encoded Library for Chemical Assessment)",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "dna_encoded_library",
        "small-molecule"
      ],
      "task_types": [
        "binding_prediction",
        "virtual_screening"
      ],
      "description": "Largest public DNA-encoded library (DEL) dataset: ~133M small molecules with 3.6B binding measurements against BRD4, sEH, and HSA. NeurIPS 2024 Kaggle competition. Includes library split for OOD evaluation. From Leash Biosciences.",
      "size": {
        "molecules": 133000000,
        "measurements": 3600000000,
        "targets": 3
      },
      "primary_paper": {
        "title": "Introducing BELKA: Big Encoded Library for Chemical Assessment",
        "doi": null,
        "year": 2024,
        "citations": 25
      },
      "official_url": "https://www.kaggle.com/competitions/leash-BELKA",
      "github_url": null,
      "leaderboard_url": "https://www.kaggle.com/competitions/leash-BELKA/leaderboard",
      "dataset_url": "https://polarishub.io/datasets/leash-bio/belka-v1",
      "paper_url": null,
      "license": "CC-BY-4.0",
      "first_release": "2024-04",
      "last_updated": "2024-10",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 82.5,
      "flags": [
        "neurips_2024",
        "kaggle",
        "ultra_large_scale",
        "competition"
      ],
      "notes": "NeurIPS 2024 competition. Unprecedented scale for public binding data. Library split tests true OOD generalization. DEL technology enables massive chemical space exploration. Now on Polaris Hub.",
      "related_benchmarks": [
        "lit-pcba",
        "dude",
        "lsd-docking-6b"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "cycpeptmpdb",
      "name": "CycPeptMPDB (Cyclic Peptide Membrane Permeability Database)",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "peptide"
      ],
      "task_types": [
        "permeability_prediction",
        "property_prediction"
      ],
      "description": "Comprehensive database of experimentally measured membrane permeability for 7,991 structurally diverse cyclic peptides from 56 publications. Covers PAMPA, Caco-2, MDCK, RRCK assays. Actively used as benchmark for ML permeability prediction in 2025-2026.",
      "size": {
        "entries": 7,
        "cyclic_peptides": 991,
        "publications": 56
      },
      "primary_paper": {
        "title": "CycPeptMPDB: A Comprehensive Database of Membrane Permeability of Cyclic Peptides",
        "doi": "10.1021/acs.jcim.2c01573",
        "year": 2023,
        "citations": 62
      },
      "official_url": "http://cycpeptmpdb.com/",
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": "http://cycpeptmpdb.com/",
      "paper_url": "https://pubs.acs.org/doi/10.1021/acs.jcim.2c01573",
      "license": "Academic",
      "first_release": "2023-01",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "composite_score": 82.5,
      "flags": [
        "biologics",
        "peptide_therapeutics",
        "permeability"
      ],
      "notes": "De facto standard benchmark for cyclic peptide permeability prediction. Multiple 2025-2026 papers benchmark 13+ ML methods against this dataset. Critical for beyond-rule-of-5 drug discovery. Version 1.2 updated Dec 2024.",
      "related_benchmarks": [
        "pepadmet"
      ],
      "experimental_validation": "wet-lab-confirmed",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "mrnabench",
      "name": "mRNABench (mRNA Property Prediction Benchmark)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "rna",
        "nucleic_acid"
      ],
      "task_types": [
        "property_prediction",
        "foundation_model_evaluation"
      ],
      "description": "Comprehensive benchmark for mRNA property prediction evaluating nucleotide foundation models. 10-11 curated datasets with 59-79 prediction tasks covering stability, translational efficiency, localization, and post-transcriptional regulation. 259,000 experiments.",
      "size": {
        "-11_datasets": 10,
        "-79_tasks": 59,
        "experiments": 259000
      },
      "primary_paper": {
        "title": "mRNABench: A Comprehensive Benchmark for mRNA Property Prediction",
        "doi": "10.1101/2025.07.05.662870",
        "year": 2025,
        "citations": 8
      },
      "official_url": null,
      "github_url": "https://github.com/morrislab/mRNABench",
      "leaderboard_url": null,
      "dataset_url": "https://github.com/morrislab/mRNABench",
      "paper_url": "https://www.biorxiv.org/content/10.1101/2025.07.05.662870v1",
      "license": "MIT",
      "first_release": "2025-07",
      "last_updated": "2025-07",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "composite_score": 86.3,
      "flags": [
        "rna_therapeutics",
        "foundation_model_eval"
      ],
      "notes": "Morris Lab (University of Toronto). First standardized benchmark for mRNA biology predictions. New Mamba-based model achieves SOTA with fewer parameters. Python package for easy extensibility. Critical for mRNA vaccine and therapy design.",
      "related_benchmarks": [
        "nucleobench",
        "mrna-design-bench"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "perturbarena",
      "name": "PerturbArena",
      "stages": [
        "target-id"
      ],
      "modalities": [
        "chemical_perturbation",
        "genetic_perturbation",
        "single_cell"
      ],
      "task_types": [
        "perturbation_prediction",
        "combinatorial_prediction",
        "cell_state_transfer"
      ],
      "description": "Comprehensive benchmark for comparing single-cell perturbation prediction models. 12 models + 3 baselines across 25 datasets with 24 metrics. Three core tasks: unseen perturbations, combinatorial perturbations, and cell state transfer across conditions.",
      "size": {
        "models": 12,
        "datasets": 25,
        "metrics": 24
      },
      "primary_paper": {
        "title": "PerturbArena: A Comprehensive Benchmark for Single-Cell Perturbation Prediction",
        "doi": "10.1101/2024.12.23.630036",
        "year": 2024,
        "citations": 12
      },
      "official_url": "https://luyitian.github.io/PerturbArena/",
      "github_url": null,
      "leaderboard_url": null,
      "dataset_url": null,
      "paper_url": "https://www.biorxiv.org/content/10.1101/2024.12.23.630036v2",
      "license": "Unknown",
      "first_release": "2024-12",
      "last_updated": "2025-06",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "composite_score": 74.4,
      "flags": [
        "virtual_cell",
        "multi_metric"
      ],
      "notes": "Complementary to scPerturBench. Emphasizes metric divergence analysis and practical method selection guidelines. Shows limited robustness to shifts across cellular contexts. Chinese research group.",
      "related_benchmarks": [
        "scperturbench",
        "openproblems-perturbation",
        "perturbbench"
      ],
      "experimental_validation": "none",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "scalebench-molprop",
      "name": "ScaleBench: Molecular Property Prediction",
      "aliases": [
        "Do Larger Models Really Win"
      ],
      "stages": [
        "lead-id-admet",
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "classification",
        "regression"
      ],
      "description": "Comprehensive benchmark assessing whether larger pre-trained models outperform compact models across 26 ADME, toxicity, safety, and bioactivity endpoints with 78 endpoint-split combinations.",
      "size": {
        "endpoints": 26,
        "endpoint_split_entries": 78,
        "model_families": 4
      },
      "primary_paper": {
        "title": "Do Larger Models Really Win in Drug Discovery? A Benchmark Assessment of Model Scaling",
        "authors": [
          "Unknown \u2014 biorxiv preprint"
        ],
        "year": 2026,
        "doi": "10.64898/2026.04.29.721568",
        "arxiv": "N/A",
        "citations": 0
      },
      "official_url": "https://www.biorxiv.org/content/10.64898/2026.04.29.721568v2",
      "github_url": null,
      "leaderboard_url": null,
      "huggingface_url": null,
      "dataset_url": null,
      "paper_url": "https://www.biorxiv.org/content/10.64898/2026.04.29.721568v2",
      "license": "Other",
      "first_release": "2026-05-04",
      "last_updated": "2026-05-15",
      "maintainers": [
        {
          "name": "Unknown",
          "affiliation": "Unknown",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 2,
        "quality": 4,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "composite_score": 69.5,
      "flags": [],
      "notes": "Timely study showing compact specialized models remain competitive vs. large foundation models for molecular property prediction. Key finding: performance depends on model-task-validation fit, not scale alone. Very early in adoption.",
      "related_benchmarks": [
        "tdc-admet",
        "boom-ood",
        "moleculenet"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "drugplayground",
      "name": "DrugPlayGround",
      "aliases": [
        "DPG"
      ],
      "stages": [
        "hit-id",
        "target-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "classification",
        "regression",
        "retrieval"
      ],
      "description": "Unified platform benchmarking LLMs and molecular embeddings across 4 drug discovery tasks: drug function analysis, drug-target interaction, synergistic combinations, and perturbation prediction.",
      "size": {
        "tasks": 4,
        "drugs_evaluated": "unknown \u2014 paper pending full release"
      },
      "primary_paper": {
        "title": "DrugPlayGround: Benchmarking Large Language Models and Embeddings for Drug Discovery",
        "authors": [
          "Tianyu Liu",
          "Sihan Jiang",
          "Fan Zhang",
          "Kunyang Sun",
          "Teresa Head-Gordon",
          "Hongyu Zhao"
        ],
        "year": 2026,
        "doi": "N/A",
        "arxiv": "2604.02346",
        "citations": 0
      },
      "official_url": "https://arxiv.org/abs/2604.02346",
      "github_url": null,
      "leaderboard_url": null,
      "huggingface_url": null,
      "dataset_url": null,
      "paper_url": "https://arxiv.org/abs/2604.02346",
      "license": "Other",
      "first_release": "2026-02-11",
      "last_updated": "2026-04-07",
      "maintainers": [
        {
          "name": "Tianyu Liu",
          "affiliation": "Yale / UC Berkeley",
          "contact": "N/A"
        },
        {
          "name": "Teresa Head-Gordon",
          "affiliation": "UC Berkeley",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 2,
        "quality": 3,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "composite_score": 66.8,
      "flags": [],
      "notes": "First unified platform to benchmark both LLMs and molecular embeddings for drug discovery. Includes Head-Gordon lab (Berkeley) \u2014 reputable. Early days for adoption. Addresses a timely question.",
      "related_benchmarks": [
        "tdc-admet",
        "scalebench-molprop"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "cellbench-ls",
      "name": "CellBench-LS",
      "aliases": [
        "CellBench Low-Supervision"
      ],
      "stages": [
        "virtual-cell",
        "disease-modeling"
      ],
      "modalities": [
        "small-molecule",
        "biologic"
      ],
      "task_types": [
        "classification",
        "regression",
        "generation"
      ],
      "description": "Rigorous framework for evaluating generalization of single-cell foundation models in low-supervision scenarios across clustering, batch correction, annotation, expression reconstruction, and perturbation prediction.",
      "size": {
        "tasks": 5,
        "datasets": "multiple general + multi-batch + perturbation scRNA-seq",
        "models_evaluated": 10
      },
      "primary_paper": {
        "title": "CellBench-LS: Benchmarking Single-Cell Foundation Models in Low-Supervision Scenarios",
        "authors": [
          "Unknown \u2014 biorxiv preprint"
        ],
        "year": 2026,
        "doi": "10.64898/2026.04.01.714123",
        "arxiv": "N/A",
        "citations": 0
      },
      "official_url": "https://www.biorxiv.org/content/10.64898/2026.04.01.714123v1",
      "github_url": null,
      "leaderboard_url": null,
      "huggingface_url": null,
      "dataset_url": null,
      "paper_url": "https://www.biorxiv.org/content/10.64898/2026.04.01.714123v1",
      "license": "Other",
      "first_release": "2026-04-01",
      "last_updated": "2026-04-01",
      "maintainers": [
        {
          "name": "Unknown",
          "affiliation": "Unknown",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 2,
        "quality": 4,
        "accessibility": 3,
        "industry_relevance": 3
      },
      "composite_score": 65.2,
      "flags": [],
      "notes": "Addresses critical gap: most scFM benchmarks use full supervision. Low-supervision evaluation is more realistic for clinical/translational settings. Finds scFMs don't consistently outperform simpler baselines.",
      "related_benchmarks": [
        "scperturbench",
        "openproblems-perturbation",
        "perturbarena"
      ],
      "experimental_validation": "retrospective",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "aws-jhu-ab-develop",
      "name": "AWS-JHU Antibody Developability Benchmark",
      "aliases": [
        "AWS Gray Lab Antibody Benchmark"
      ],
      "stages": [
        "developmental-candidate",
        "lead-id-admet"
      ],
      "modalities": [
        "biologic"
      ],
      "task_types": [
        "classification",
        "regression"
      ],
      "description": "First large-scale heterogeneous antibody developability dataset with 50 seed antibodies across 4 structural formats, 42 antigens, 6 developability traits, all wet-lab validated. Supports zero-shot evaluation.",
      "size": {
        "seed_antibodies": 50,
        "structural_formats": 4,
        "antigens": 42,
        "developability_traits": 6
      },
      "primary_paper": {
        "title": "AWS-JHU Antibody Developability Benchmark",
        "authors": [
          "Jeffrey Gray",
          "AWS AI Labs"
        ],
        "year": 2026,
        "doi": "N/A \u2014 paper pending",
        "arxiv": "N/A",
        "citations": 0
      },
      "official_url": "https://www.amazon.science/news/aws-gray-lab-johns-hopkins-announce-groundbreaking-database-for-ai-ml-antibody-design",
      "github_url": null,
      "leaderboard_url": null,
      "huggingface_url": null,
      "dataset_url": null,
      "paper_url": null,
      "license": "Other",
      "first_release": "2026-04-14",
      "last_updated": "2026-04-14",
      "maintainers": [
        {
          "name": "Jeffrey Gray",
          "affiliation": "Johns Hopkins University",
          "contact": "N/A"
        },
        {
          "name": "AWS AI Labs",
          "affiliation": "Amazon Web Services",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 2,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 5
      },
      "composite_score": 72.3,
      "flags": [],
      "notes": "Groundbreaking for antibody developability \u2014 fills gap where most benchmarks focus on binding only. Wet-lab validated ground truth across diverse formats. Zero-shot evaluation support is novel. AWS backing suggests long maintenance.",
      "related_benchmarks": [
        "flab2",
        "benchbb",
        "abbibench"
      ],
      "experimental_validation": "wet-lab-confirmed",
      "expert_ids": [],
      "group_ids": [],
      "hosted_by": []
    },
    {
      "id": "vibeproteinbench",
      "name": "VibeProteinBench (VPD-Bench)",
      "aliases": [
        "VPD-Bench"
      ],
      "stages": [
        "Target ID",
        "Developmental Candidate"
      ],
      "modalities": [
        "biologic",
        "small molecule"
      ],
      "task_types": [
        "generation",
        "classification",
        "retrieval"
      ],
      "description": "Language-interfaced benchmark for evaluating generalist capabilities in 'vibe protein design' \u2014 covers recognition, engineering, and generation stages using natural-language queries and in silico validation.",
      "size": {
        "tasks": 3,
        "categories": 3,
        "proteins": "unknown \u2014 multi-task evaluation suite"
      },
      "primary_paper": {
        "title": "VibeProteinBench: An Evaluation Benchmark for Language-interfaced Vibe Protein Design",
        "authors": [
          "Hyunjin Seo",
          "Hongjoon Ahn",
          "Jimin Park",
          "Sungjun Han",
          "Gyubok Lee",
          "Soojung Yang",
          "Joseph S Brown",
          "Leo Chen",
          "Gina El Nesr",
          "Feyisayo Eweje",
          "Sarah Gurev"
        ],
        "year": 2026,
        "doi": "N/A \u2014 preprint",
        "arxiv": "2605.10978",
        "citations": 0
      },
      "official_url": "https://arxiv.org/abs/2605.10978",
      "github_url": "N/A \u2014 not yet released",
      "leaderboard_url": "N/A",
      "huggingface_url": "N/A",
      "license": "Other",
      "first_release": "2026-05-09",
      "last_updated": "2026-05-18",
      "maintainers": [
        {
          "name": "Hyunjin Seo",
          "affiliation": "Unknown",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 3,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 1,
        "quality": 3,
        "accessibility": 2,
        "industry_relevance": 3
      },
      "composite_score": 43.2,
      "experimental_validation": "none",
      "flags": [],
      "notes": "Novel concept: first benchmark for 'vibe protein design' using natural language interfaces. Very new (May 2026), no community adoption yet. Evaluates LLM+protein model pipelines. Code/data availability unclear from preprint. Scores conservative pending community validation and data release.",
      "related_benchmarks": [
        "proteingym",
        "biodesignbench",
        "protein-design-bench-2026",
        "flip"
      ]
    },
    {
      "id": "compgen-mlip",
      "name": "CompGen-MLIP: Compositional Generalisation for ML Interatomic Potentials",
      "aliases": [
        "CompGen-MLIP"
      ],
      "stages": [
        "Hit ID",
        "Lead ID / ADMET"
      ],
      "modalities": [
        "small molecule"
      ],
      "task_types": [
        "regression"
      ],
      "description": "Benchmark with 4 tasks evaluating compositional generalization of ML interatomic potentials \u2014 whether models learn transferable chemistry vs. interpolating training patterns. Relevant to molecular dynamics-based drug design.",
      "size": {
        "tasks": 4,
        "molecules": "unknown \u2014 compositional split evaluation"
      },
      "primary_paper": {
        "title": "Benchmarking Compositional Generalisation for Machine Learning Interatomic Potentials",
        "authors": [
          "Amir Masoud Nourollah",
          "Irtaza Khalid",
          "Stefano Leoni",
          "Steven Schockaert"
        ],
        "year": 2026,
        "doi": "N/A \u2014 preprint",
        "arxiv": "2605.08988",
        "citations": 0
      },
      "official_url": "https://arxiv.org/abs/2605.08988",
      "github_url": "N/A \u2014 not yet released",
      "leaderboard_url": "N/A",
      "huggingface_url": "N/A",
      "license": "Other",
      "first_release": "2026-05-09",
      "last_updated": "2026-05-09",
      "maintainers": [
        {
          "name": "Amir Masoud Nourollah",
          "affiliation": "Cardiff University",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 2,
        "maintenance": 3,
        "adoption": 1,
        "quality": 3,
        "accessibility": 2,
        "industry_relevance": 2
      },
      "composite_score": 39.8,
      "experimental_validation": "retrospective",
      "flags": [],
      "notes": "Addresses important gap in MLIP evaluation \u2014 OOD generalization to unseen molecular compositions. Shows current models struggle (10x error on OOD). More computational chemistry than direct drug discovery, but relevant to free energy calculations and MD simulations used in drug design. Narrow scope (4 tasks only).",
      "related_benchmarks": [
        "matbench",
        "pdbbind",
        "casf-2016"
      ]
    },
    {
      "id": "mpp-foundation-survey-bench",
      "name": "MPP Foundation Model Benchmark",
      "aliases": [
        "MPP-FM-Bench",
        "DL-MPP Survey Benchmark"
      ],
      "stages": [
        "Lead ID / ADMET",
        "Hit ID"
      ],
      "modalities": [
        "small molecule"
      ],
      "task_types": [
        "classification",
        "regression"
      ],
      "description": "Comprehensive benchmark from systematic survey of deep learning for molecular property prediction in the foundation model era. Provides unified evaluation across 4 paradigms (descriptor-based, GNN, pretrained, foundation models) with standardized splits and metrics.",
      "size": {
        "datasets": "multiple standard (MoleculeNet, TDC subsets)",
        "models_evaluated": 30,
        "paradigms": 4
      },
      "primary_paper": {
        "title": "A Systematic Survey and Benchmark of Deep Learning for Molecular Property Prediction in the Foundation Model Era",
        "authors": [
          "Zongru Li",
          "Xingsheng Chen",
          "Honggang Wen",
          "Regina Qianru Zhang",
          "Ming Li",
          "Xiaojin Zhang",
          "Hongzhi Yin",
          "Qiang Yang",
          "Kwok-Yan Lam",
          "Pietro Lio",
          "Siu-Ming Yiu"
        ],
        "year": 2026,
        "doi": "N/A \u2014 preprint",
        "arxiv": "2604.16586",
        "citations": 0
      },
      "official_url": "https://arxiv.org/abs/2604.16586",
      "github_url": "N/A \u2014 not yet released",
      "leaderboard_url": "N/A",
      "huggingface_url": "N/A",
      "license": "Other",
      "first_release": "2026-04-17",
      "last_updated": "2026-04-17",
      "maintainers": [
        {
          "name": "Zongru Li",
          "affiliation": "University of Hong Kong",
          "contact": "N/A"
        },
        {
          "name": "Siu-Ming Yiu",
          "affiliation": "University of Hong Kong",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 1,
        "quality": 4,
        "accessibility": 2,
        "industry_relevance": 3
      },
      "composite_score": 50.3,
      "experimental_validation": "retrospective",
      "flags": [],
      "notes": "Valuable meta-benchmark tracing evolution from descriptors to foundation models. Includes industry perspective and highlights evaluation protocol challenges. Multi-institutional (HKU, NTU, Cambridge). Released April 2026, no code/leaderboard yet. Higher potential than current scores suggest once code is released.",
      "related_benchmarks": [
        "moleculenet",
        "tdc-admet",
        "polaris-admet",
        "scalebench-molprop"
      ]
    },
    {
      "id": "foldbench",
      "name": "FoldBench",
      "aliases": [
        "FoldBench All-Atom"
      ],
      "stages": [
        "Hit ID"
      ],
      "modalities": [
        "small molecule",
        "biologic"
      ],
      "task_types": [
        "docking",
        "structure prediction"
      ],
      "description": "Comprehensive benchmark of 1,522 biological assemblies across 9 prediction tasks evaluating all-atom biomolecular structure prediction models including proteins, nucleic acids, ligands, and ions.",
      "size": {
        "assemblies": 1522,
        "tasks": 9,
        "splits": {
          "train": 0,
          "val": 0,
          "test": 1522
        }
      },
      "primary_paper": {
        "title": "Benchmarking all-atom biomolecular structure prediction with FoldBench",
        "authors": [
          "Shitong Xu",
          "Qian Feng",
          "Liang Qiao"
        ],
        "year": 2026,
        "doi": "10.1038/s41467-026-00442-x",
        "arxiv": "N/A",
        "citations": 12
      },
      "official_url": "https://github.com/FoldBench/FoldBench",
      "github_url": "https://github.com/FoldBench/FoldBench",
      "leaderboard_url": "N/A \u2014 evaluation scripts provided",
      "huggingface_url": "N/A",
      "license": "CC-BY",
      "first_release": "2025-05-27",
      "last_updated": "2026-03-15",
      "maintainers": [
        {
          "name": "Shitong Xu",
          "affiliation": "Peking University",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 2,
        "quality": 4,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "composite_score": 55.8,
      "flags": [],
      "notes": "Published Nat Comms 2026. Covers 9 task types (monomer, multimer, nucleic acid, ligand, ion, antibody-antigen, etc). Revealed that ligand docking accuracy decreases with training set dissimilarity and antibody-antigen predictions fail >50%. Strong structural benchmark but still gaining community traction.",
      "related_benchmarks": [
        "posebusters",
        "plinder",
        "casp15"
      ]
    },
    {
      "id": "sair-dataset",
      "name": "SAIR",
      "aliases": [
        "Synthetic All-atom Interaction Representations"
      ],
      "stages": [
        "Hit ID"
      ],
      "modalities": [
        "small molecule"
      ],
      "task_types": [
        "docking",
        "classification",
        "regression"
      ],
      "description": "Large-scale synthetic structural dataset for protein-ligand interactions, enabling deep learning models to learn binding interactions without experimental structure biases. Published at ICLR 2026.",
      "size": {
        "complexes": 500000,
        "proteins": 15000,
        "molecules": 200000,
        "splits": {
          "train": 400000,
          "val": 50000,
          "test": 50000
        }
      },
      "primary_paper": {
        "title": "SAIR: Enabling Deep Learning for Protein-Ligand Interactions with a Synthetic Structural Dataset",
        "authors": [
          "Pablo Lemos",
          "Zane Beckwith",
          "Sasaank Bandi",
          "Maarten van Damme",
          "Jordan Crivelli-Decker",
          "Benjamin J. Shields",
          "Thomas Merth"
        ],
        "year": 2026,
        "doi": "10.1101/2025.06.17.660168",
        "arxiv": "N/A",
        "citations": 8
      },
      "official_url": "https://huggingface.co/datasets/SandboxAQ/SAIR",
      "github_url": "N/A \u2014 code in HuggingFace repo",
      "leaderboard_url": "N/A",
      "huggingface_url": "https://huggingface.co/datasets/SandboxAQ/SAIR",
      "license": "CC-BY-NC",
      "first_release": "2025-06-17",
      "last_updated": "2026-04-25",
      "maintainers": [
        {
          "name": "Pablo Lemos",
          "affiliation": "SandboxAQ",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 2,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "composite_score": 51.2,
      "flags": [
        "self_referential"
      ],
      "notes": "ICLR 2026 paper from SandboxAQ. Synthetic data approach avoids PDB biases but raises generalizability questions. Self-referential flag: SandboxAQ models dominate evaluations. Dataset available on HuggingFace. Key insight: synthetic data can match/exceed experimental for binding pose prediction.",
      "related_benchmarks": [
        "posebusters",
        "plinder",
        "foldbench"
      ]
    },
    {
      "id": "openadmet-avoidome",
      "name": "OpenADMET / Avoid-ome",
      "aliases": [
        "OpenADMET",
        "Avoid-ome benchmark"
      ],
      "stages": [
        "Lead ID / ADMET",
        "IND-enabling"
      ],
      "modalities": [
        "small molecule"
      ],
      "task_types": [
        "classification",
        "regression"
      ],
      "description": "Open-science initiative creating pre-competitive mechanistic ADMET datasets targeting the 'Avoid-ome' \u2014 proteins acting as anti-targets. Uses high-throughput structural biology and active learning with community challenges.",
      "size": {
        "proteins": 50,
        "molecules": 100000,
        "splits": {
          "train": 0,
          "val": 0,
          "test": 0
        }
      },
      "primary_paper": {
        "title": "Mapping the avoid-ome: a systematic open-science approach to predictive ADMET",
        "authors": [
          "James S. Fraser",
          "Steven Edgar",
          "L. Naomi Handly",
          "Sriram Kosuri",
          "John D. Chodera",
          "Mark Murcko",
          "W. Patrick Walters"
        ],
        "year": 2026,
        "doi": "10.1038/s41467-026-73410-8",
        "arxiv": "N/A",
        "citations": 0
      },
      "official_url": "https://www.nature.com/articles/s41467-026-73410-8",
      "github_url": "N/A \u2014 datasets forthcoming",
      "leaderboard_url": "N/A \u2014 community challenges planned",
      "huggingface_url": "N/A",
      "license": "CC-BY",
      "first_release": "2026-05-25",
      "last_updated": "2026-05-25",
      "maintainers": [
        {
          "name": "James S. Fraser",
          "affiliation": "UCSF",
          "contact": "N/A"
        },
        {
          "name": "John D. Chodera",
          "affiliation": "MSKCC",
          "contact": "N/A"
        },
        {
          "name": "W. Patrick Walters",
          "affiliation": "Relay Therapeutics",
          "contact": "N/A"
        }
      ],
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 2,
        "quality": 4,
        "accessibility": 3,
        "industry_relevance": 5
      },
      "composite_score": 53.6,
      "flags": [],
      "notes": "Nat Comms perspective (May 2026) by top structural biology/comp chem leaders (Fraser, Chodera, Murcko, Walters). Proposes mechanistic ADMET datasets grounded in structural 'ground truth'. Still early \u2014 datasets forthcoming \u2014 but the consortium authority and Nature publication warrant inclusion. High industry relevance given Relay/MSKCC/UCSF backing.",
      "related_benchmarks": [
        "tdc-admet",
        "toxcast"
      ]
    }
  ]
}