A10.1 BioAssay Screening: Enough Actives

# A10.1 BioAssay Screening: Enough Actives

import requests

def get_assay_outcome_counts(aid, count_basis="CID"):
    """
    Retrieve outcome counts for a PubChem BioAssay AID.

    The assay summary is fetched from the PubChem PUG REST endpoint
    ``/assay/aid/{aid}/summary/JSON`` and the first entry of
    ``AssaySummaries.AssaySummary`` is read.

    Parameters
    ----------
    aid : int
        PubChem Assay ID.
    count_basis : str
        "CID" (default) counts unique compounds (recommended for
        cheminformatics/ML), or "SID" counts submitted substances.
        Matching is case-insensitive.

    Returns
    -------
    dict
        When the summary carries ``<basis>CountAll``, the keys are
        ``Basis``, ``Total``, ``Active``, ``Inactive``, ``Inconclusive``,
        ``Unspecified`` and ``Probe``. Otherwise a fallback lookup is used
        and only ``Basis``, ``Total``, ``Active``, ``Inactive`` and
        ``Inconclusive`` are returned. Any field missing from the summary
        is reported as 0.

    Raises
    ------
    ValueError
        If ``count_basis`` is neither "CID" nor "SID". Raised before any
        request is sent.
    requests.HTTPError
        If PubChem answers with an error status.
    """
    # Validate first: a bad argument should fail fast instead of costing a
    # network round trip.
    basis = str(count_basis).upper()
    if basis not in {"CID", "SID"}:
        raise ValueError("count_basis must be 'CID' or 'SID'")

    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/summary/JSON"
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    summary = r.json()["AssaySummaries"]["AssaySummary"][0]

    # Preferred schema: fields are named "<basis>Count<Outcome>".
    prefix = f"{basis}Count"
    if f"{prefix}All" in summary:
        return {
            "Basis": basis,
            "Total": summary.get(f"{prefix}All", 0),
            "Active": summary.get(f"{prefix}Active", 0),
            "Inactive": summary.get(f"{prefix}Inactive", 0),
            "Inconclusive": summary.get(f"{prefix}Inconclusive", 0),
            "Unspecified": summary.get(f"{prefix}Unspecified", 0),
            "Probe": summary.get(f"{prefix}Probe", 0),
        }

    # Fallback schema (less common, but we handle it).
    # Try a couple of plausible alternatives if PubChem changes formatting.
    # Only the total for the *requested* basis is considered, so an SID
    # request can never be answered with a CID total (or vice versa).
    alt_map = {
        "Total": ["TotalCount", f"{basis}Count"],
        "Active": ["ActiveCount"],
        "Inactive": ["InactiveCount"],
        "Inconclusive": ["InconclusiveCount"],
    }

    out = {"Basis": basis}
    for key, candidates in alt_map.items():
        # First candidate present in the summary wins; default to 0.
        out[key] = next((summary[c] for c in candidates if c in summary), 0)

    return out

# Fetch the same assay once per counting basis (CID first, then SID) and
# show both results so the compound-level and substance-level counts can be
# compared.
counts_cid, counts_sid = (
    get_assay_outcome_counts(743139, basis) for basis in ("CID", "SID")
)

print("CID counts:", counts_cid)
print("SID counts:", counts_sid)

CID counts: {'Basis': 'CID', 'Total': 8099, 'Active': 326, 'Inactive': 6114, 'Inconclusive': 2092, 'Unspecified': 0, 'Probe': 0}
SID counts: {'Basis': 'SID', 'Total': 10486, 'Active': 379, 'Inactive': 7562, 'Inconclusive': 2545, 'Unspecified': 0, 'Probe': 0}

import requests

def get_assay_summary(aid):
    """
    Fetch one PubChem BioAssay summary and flatten it into a single record.

    The first entry of ``AssaySummaries.AssaySummary`` from the PUG REST
    summary endpoint is read. The returned dict holds the AID as passed in,
    the assay name, and the Total/Active/Inactive/Inconclusive counts on
    both the CID and the SID basis. A missing name becomes "" and a missing
    count becomes 0.
    """
    # (output key, field in the PubChem summary, value used when absent)
    field_map = (
        ("AssayName", "Name", ""),
        # CID counts (preferred for ML)
        ("CID_Total", "CIDCountAll", 0),
        ("CID_Active", "CIDCountActive", 0),
        ("CID_Inactive", "CIDCountInactive", 0),
        ("CID_Inconclusive", "CIDCountInconclusive", 0),
        # SID counts (for completeness)
        ("SID_Total", "SIDCountAll", 0),
        ("SID_Active", "SIDCountActive", 0),
        ("SID_Inactive", "SIDCountInactive", 0),
        ("SID_Inconclusive", "SIDCountInconclusive", 0),
    )

    response = requests.get(
        f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/summary/JSON",
        timeout=30,
    )
    response.raise_for_status()
    record = response.json()["AssaySummaries"]["AssaySummary"][0]

    flattened = {"AID": aid}
    for out_key, source_key, fallback in field_map:
        flattened[out_key] = record.get(source_key, fallback)
    return flattened

import pandas as pd

# Candidate assays to screen. The trailing labels are the author's intended
# targets.
# NOTE(review): several labels disagree with the assay names PubChem returned
# in the recorded output below this cell — verify every AID before relying on
# the labels.
candidate_aids = [
    743040,  # ERα agonist
    743077,  # ERα antagonist  (NOTE(review): returned name reads "...small molecule agonists..." — confirm)
    720552,  # AR antagonist   (NOTE(review): returned name reads "...small molecule agonists of the ..." — confirm)
    720551,  # AR agonist      (NOTE(review): returned name is "qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS" — likely wrong AID)
    743122,  # AhR activation
    743065,  # PPARγ
    743067,  # PPARδ
    743053,  # NRF2 antioxidant response
]

# Collect one summary record per AID. A failure for one assay must not abort
# the whole screen, so any exception is captured as a placeholder row holding
# only AID / AssayName="ERROR" / Error; that row carries no count fields.
rows = []
for aid in candidate_aids:
    try:
        rows.append(get_assay_summary(aid))
    except Exception as e:
        rows.append({
            "AID": aid,
            "AssayName": "ERROR",
            "Error": str(e)
        })

# One DataFrame row per candidate assay; the bare name displays it in a notebook.
df = pd.DataFrame(rows)
df

	AID	AssayName	CID_Total	CID_Active	CID_Inactive	CID_Inconclusive	SID_Total	SID_Active	SID_Inactive	SID_Inconclusive
0	743040	qHTS assay to identify small molecule agonists...	8099	341	7706	201	10486	461	9812	213
1	743077	qHTS assay to identify small molecule agonists...	8099	438	7303	736	10486	589	9089	808
2	720552	qHTS assay for small molecule agonists of the ...	8099	527	6960	816	10488	659	8888	941
3	720551	qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS	343666	1267	341674	730	343909	1267	341912	730
4	743122	qHTS assay to identify small molecule that act...	8099	875	6397	1357	10486	1063	7945	1478
5	743065	qHTS assay to identify small molecule antagoni...	8099	1536	5937	912	10486	1880	7568	1038
6	743067	qHTS assay to identify small molecule antagoni...	8099	378	5631	2491	10486	426	7088	2972
7	743053	qHTS assay to identify small molecule agonists...	8099	260	7168	918	10486	372	9070	1044

# Share of tested compounds (CID basis) that were active / inconclusive.
df["CID_ActiveFraction"] = df["CID_Active"].div(df["CID_Total"])
df["CID_InconclusiveFraction"] = df["CID_Inconclusive"].div(df["CID_Total"])

# Compact view ranked from the most to the least active-rich assay.
df.loc[
    :,
    [
        "AID",
        "AssayName",
        "CID_Total",
        "CID_Active",
        "CID_ActiveFraction",
        "CID_InconclusiveFraction",
    ],
].sort_values(by="CID_ActiveFraction", ascending=False)

	AID	AssayName	CID_Total	CID_Active	CID_ActiveFraction	CID_InconclusiveFraction
5	743065	qHTS assay to identify small molecule antagoni...	8099	1536	0.189653	0.112606
4	743122	qHTS assay to identify small molecule that act...	8099	875	0.108038	0.167552
2	720552	qHTS assay for small molecule agonists of the ...	8099	527	0.065070	0.100753
1	743077	qHTS assay to identify small molecule agonists...	8099	438	0.054081	0.090875
6	743067	qHTS assay to identify small molecule antagoni...	8099	378	0.046672	0.307569
0	743040	qHTS assay to identify small molecule agonists...	8099	341	0.042104	0.024818
7	743053	qHTS assay to identify small molecule agonists...	8099	260	0.032103	0.113347
3	720551	qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS	343666	1267	0.003687	0.002124

def suitability(
    row,
    *,
    min_total=5000,
    min_actives=200,
    imbalanced_below=0.01,
    intermediate_below=0.05,
):
    """
    Classify one assay row by how suitable it is as a teaching/ML dataset.

    Parameters
    ----------
    row : mapping
        Anything indexable by "CID_Total", "CID_Active" and
        "CID_ActiveFraction" (e.g. a DataFrame row or a dict).
    min_total : int
        Assays with fewer tested compounds are "Too Small".
    min_actives : int
        Assays with fewer active compounds are "Too Few Actives".
    imbalanced_below : float
        Active fractions below this are "Advanced / Highly Imbalanced".
    intermediate_below : float
        Active fractions below this (but not below ``imbalanced_below``)
        are "Intermediate".

    Returns
    -------
    str
        One of "Data Unavailable", "Too Small", "Too Few Actives",
        "Advanced / Highly Imbalanced", "Intermediate" or "Intro-Friendly".
        The checks are applied in that order and the first match wins.
    """
    total = row["CID_Total"]
    actives = row["CID_Active"]

    # Rows built from a failed fetch have no counts (NaN in a DataFrame).
    # Every "<" comparison against NaN is False, so without this guard such
    # rows would fall through to "Intro-Friendly".
    if pd.isna(total) or pd.isna(actives):
        return "Data Unavailable"

    if total < min_total:
        return "Too Small"
    if actives < min_actives:
        return "Too Few Actives"

    fraction = row["CID_ActiveFraction"]
    if pd.isna(fraction):
        return "Data Unavailable"
    if fraction < imbalanced_below:
        return "Advanced / Highly Imbalanced"
    if fraction < intermediate_below:
        return "Intermediate"
    return "Intro-Friendly"

# Label each assay (one row at a time) with its suggested use, then display
# the full table.
df["SuggestedUse"] = df.apply(func=suitability, axis="columns")
df

	AID	AssayName	CID_Total	CID_Active	CID_Inactive	CID_Inconclusive	SID_Total	SID_Active	SID_Inactive	SID_Inconclusive	CID_ActiveFraction	CID_InconclusiveFraction	SuggestedUse
0	743040	qHTS assay to identify small molecule agonists...	8099	341	7706	201	10486	461	9812	213	0.042104	0.024818	Intermediate
1	743077	qHTS assay to identify small molecule agonists...	8099	438	7303	736	10486	589	9089	808	0.054081	0.090875	Intro-Friendly
2	720552	qHTS assay for small molecule agonists of the ...	8099	527	6960	816	10488	659	8888	941	0.065070	0.100753	Intro-Friendly
3	720551	qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS	343666	1267	341674	730	343909	1267	341912	730	0.003687	0.002124	Advanced / Highly Imbalanced
4	743122	qHTS assay to identify small molecule that act...	8099	875	6397	1357	10486	1063	7945	1478	0.108038	0.167552	Intro-Friendly
5	743065	qHTS assay to identify small molecule antagoni...	8099	1536	5937	912	10486	1880	7568	1038	0.189653	0.112606	Intro-Friendly
6	743067	qHTS assay to identify small molecule antagoni...	8099	378	5631	2491	10486	426	7088	2972	0.046672	0.307569	Intermediate
7	743053	qHTS assay to identify small molecule agonists...	8099	260	7168	918	10486	372	9070	1044	0.032103	0.113347	Intermediate