A10.1 BioAssay Screening: Enough Actives

A10.1 BioAssay Screening: Enough Actives

import requests

def get_assay_outcome_counts(aid, count_basis="CID"):
    """
    Retrieve outcome counts for a PubChem BioAssay AID.

    The counts are read from the first entry of the PUG REST assay
    summary for ``aid``.

    Parameters
    ----------
    aid : int
        PubChem Assay ID.
    count_basis : str
        "CID" (default) counts unique compounds (recommended for
        cheminformatics/ML), or "SID" counts submitted substances.
        Matching is case-insensitive.

    Returns
    -------
    dict
        Always contains "Basis", "Total", "Active", "Inactive" and
        "Inconclusive". When the summary uses the ``<basis>Count*`` schema
        the dict also contains "Unspecified" and "Probe". Fields absent
        from the summary are reported as 0.

    Raises
    ------
    ValueError
        If ``count_basis`` is neither "CID" nor "SID". Raised before any
        network traffic.
    requests.HTTPError
        If PubChem answers with an error status.
    """
    # Validate first: a bad argument should fail immediately rather than
    # after (or instead of) a 30-second network round-trip.
    basis = count_basis.upper()
    if basis not in {"CID", "SID"}:
        raise ValueError("count_basis must be 'CID' or 'SID'")

    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/summary/JSON"
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    summary = r.json()["AssaySummaries"]["AssaySummary"][0]

    # Preferred schema: keys such as CIDCountActive / SIDCountActive.
    if f"{basis}CountAll" in summary:
        suffix_by_label = {
            "Total": "All",
            "Active": "Active",
            "Inactive": "Inactive",
            "Inconclusive": "Inconclusive",
            "Unspecified": "Unspecified",
            "Probe": "Probe",
        }
        out = {"Basis": basis}
        for label, suffix in suffix_by_label.items():
            out[label] = summary.get(f"{basis}Count{suffix}", 0)
        return out

    # Fallback schema: a few plausible alternative key names in case the
    # response format differs. Only the count matching the requested basis
    # is considered for "Total", so a CID count is never reported under a
    # "SID" label (or vice versa).
    alt_map = {
        "Total": ["TotalCount", f"{basis}Count"],
        "Active": ["ActiveCount"],
        "Inactive": ["InactiveCount"],
        "Inconclusive": ["InconclusiveCount"],
    }

    out = {"Basis": basis}
    for label, candidates in alt_map.items():
        # First candidate key present in the summary wins; default to 0.
        out[label] = next((summary[c] for c in candidates if c in summary), 0)

    return out
# Fetch the outcome counts for AID 743139 on both bases (CID first, then
# SID), then show them side by side.
counts_cid, counts_sid = (
    get_assay_outcome_counts(743139, which) for which in ("CID", "SID")
)

print("CID counts:", counts_cid)
print("SID counts:", counts_sid)
CID counts: {'Basis': 'CID', 'Total': 8099, 'Active': 326, 'Inactive': 6114, 'Inconclusive': 2092, 'Unspecified': 0, 'Probe': 0}
SID counts: {'Basis': 'SID', 'Total': 10486, 'Active': 379, 'Inactive': 7562, 'Inconclusive': 2545, 'Unspecified': 0, 'Probe': 0}
import requests

def get_assay_summary(aid):
    """
    Fetch the name and the CID/SID outcome counts of a PubChem BioAssay.

    The values come from the first entry of the PUG REST assay summary for
    ``aid``. The returned dict holds "AID", "AssayName", and for each of the
    prefixes "CID_" and "SID_" the keys "Total", "Active", "Inactive" and
    "Inconclusive". A missing name becomes "" and a missing count becomes 0.
    """
    endpoint = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/summary/JSON"
    response = requests.get(endpoint, timeout=30)
    response.raise_for_status()

    payload = response.json()
    record = payload["AssaySummaries"]["AssaySummary"][0]

    # (output label, suffix used in the summary's <basis>Count<suffix> keys)
    outcome_fields = (
        ("Total", "All"),
        ("Active", "Active"),
        ("Inactive", "Inactive"),
        ("Inconclusive", "Inconclusive"),
    )

    result = {"AID": aid, "AssayName": record.get("Name", "")}
    # CID counts first (preferred for ML), then SID counts for completeness.
    for basis in ("CID", "SID"):
        for label, suffix in outcome_fields:
            result[f"{basis}_{label}"] = record.get(f"{basis}Count{suffix}", 0)
    return result
import pandas as pd

# Assays to screen; the trailing comments are the author's intended targets.
candidate_aids = [
    743040,  # ERα agonist
    743077,  # ERα antagonist
    720552,  # AR antagonist
    # NOTE(review): the recorded output for 720551 shows a KCNH2 inhibitor
    # screen (~344k CIDs), not an AR agonist assay — confirm the intended AID.
    720551,  # AR agonist
    743122,  # AhR activation
    743065,  # PPARγ
    743067,  # PPARδ
    743053,  # NRF2 antioxidant response
]

# Collect one record per AID. A failed lookup is kept as an "ERROR" row
# carrying the exception text, so one bad AID does not abort the batch.
rows = []
for aid in candidate_aids:
    try:
        record = get_assay_summary(aid)
    except Exception as exc:
        record = {"AID": aid, "AssayName": "ERROR", "Error": str(exc)}
    rows.append(record)

df = pd.DataFrame(rows)
df
AID AssayName CID_Total CID_Active CID_Inactive CID_Inconclusive SID_Total SID_Active SID_Inactive SID_Inconclusive
0 743040 qHTS assay to identify small molecule agonists... 8099 341 7706 201 10486 461 9812 213
1 743077 qHTS assay to identify small molecule agonists... 8099 438 7303 736 10486 589 9089 808
2 720552 qHTS assay for small molecule agonists of the ... 8099 527 6960 816 10488 659 8888 941
3 720551 qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS 343666 1267 341674 730 343909 1267 341912 730
4 743122 qHTS assay to identify small molecule that act... 8099 875 6397 1357 10486 1063 7945 1478
5 743065 qHTS assay to identify small molecule antagoni... 8099 1536 5937 912 10486 1880 7568 1038
6 743067 qHTS assay to identify small molecule antagoni... 8099 378 5631 2491 10486 426 7088 2972
7 743053 qHTS assay to identify small molecule agonists... 8099 260 7168 918 10486 372 9070 1044
# Share of tested compounds (CID basis) that were active / inconclusive.
for outcome in ("Active", "Inconclusive"):
    df[f"CID_{outcome}Fraction"] = df[f"CID_{outcome}"] / df["CID_Total"]

# Compact view, most active-rich assays first.
overview_columns = [
    "AID",
    "AssayName",
    "CID_Total",
    "CID_Active",
    "CID_ActiveFraction",
    "CID_InconclusiveFraction",
]
df[overview_columns].sort_values("CID_ActiveFraction", ascending=False)
AID AssayName CID_Total CID_Active CID_ActiveFraction CID_InconclusiveFraction
5 743065 qHTS assay to identify small molecule antagoni... 8099 1536 0.189653 0.112606
4 743122 qHTS assay to identify small molecule that act... 8099 875 0.108038 0.167552
2 720552 qHTS assay for small molecule agonists of the ... 8099 527 0.065070 0.100753
1 743077 qHTS assay to identify small molecule agonists... 8099 438 0.054081 0.090875
6 743067 qHTS assay to identify small molecule antagoni... 8099 378 0.046672 0.307569
0 743040 qHTS assay to identify small molecule agonists... 8099 341 0.042104 0.024818
7 743053 qHTS assay to identify small molecule agonists... 8099 260 0.032103 0.113347
3 720551 qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS 343666 1267 0.003687 0.002124
def suitability(row):
    """
    Classify an assay row by how suitable it is as a modelling dataset.

    Parameters
    ----------
    row : mapping
        Must provide "CID_Total", "CID_Active" and "CID_ActiveFraction"
        (for example a DataFrame row or a plain dict).

    Returns
    -------
    str
        "Unknown" if any of the three values is missing (None or NaN),
        otherwise the first matching label of:
        "Too Small" (fewer than 5000 compounds),
        "Too Few Actives" (fewer than 200 actives),
        "Advanced / Highly Imbalanced" (active fraction below 0.01),
        "Intermediate" (active fraction below 0.05),
        "Intro-Friendly" (everything else).
    """
    total = row["CID_Total"]
    active = row["CID_Active"]
    active_fraction = row["CID_ActiveFraction"]

    # Rows built from a failed lookup carry NaN counts. Every "<" comparison
    # against NaN is False, so without this guard such rows would fall
    # through to "Intro-Friendly". NaN is the only value unequal to itself.
    for value in (total, active, active_fraction):
        if value is None or value != value:
            return "Unknown"

    if total < 5000:
        return "Too Small"
    if active < 200:
        return "Too Few Actives"
    if active_fraction < 0.01:
        return "Advanced / Highly Imbalanced"
    if active_fraction < 0.05:
        return "Intermediate"
    return "Intro-Friendly"

# Label every assay row-wise and store the label as a new column.
suggested_use = df.apply(suitability, axis="columns")
df["SuggestedUse"] = suggested_use
df
AID AssayName CID_Total CID_Active CID_Inactive CID_Inconclusive SID_Total SID_Active SID_Inactive SID_Inconclusive CID_ActiveFraction CID_InconclusiveFraction SuggestedUse
0 743040 qHTS assay to identify small molecule agonists... 8099 341 7706 201 10486 461 9812 213 0.042104 0.024818 Intermediate
1 743077 qHTS assay to identify small molecule agonists... 8099 438 7303 736 10486 589 9089 808 0.054081 0.090875 Intro-Friendly
2 720552 qHTS assay for small molecule agonists of the ... 8099 527 6960 816 10488 659 8888 941 0.065070 0.100753 Intro-Friendly
3 720551 qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS 343666 1267 341674 730 343909 1267 341912 730 0.003687 0.002124 Advanced / Highly Imbalanced
4 743122 qHTS assay to identify small molecule that act... 8099 875 6397 1357 10486 1063 7945 1478 0.108038 0.167552 Intro-Friendly
5 743065 qHTS assay to identify small molecule antagoni... 8099 1536 5937 912 10486 1880 7568 1038 0.189653 0.112606 Intro-Friendly
6 743067 qHTS assay to identify small molecule antagoni... 8099 378 5631 2491 10486 426 7088 2972 0.046672 0.307569 Intermediate
7 743053 qHTS assay to identify small molecule agonists... 8099 260 7168 918 10486 372 9070 1044 0.032103 0.113347 Intermediate