A10.1 BioAssay Screening: Enough Actives
import requests
def get_assay_outcome_counts(aid, count_basis="CID"):
    """
    Retrieve Active/Inactive/Inconclusive/Total counts for a PubChem BioAssay AID.

    Parameters
    ----------
    aid : int
        PubChem Assay ID.
    count_basis : str
        "CID" (default) counts unique compounds (recommended for cheminformatics/ML),
        or "SID" counts submitted substances. Matched case-insensitively.

    Returns
    -------
    dict
        Counts and basis used. With the preferred schema the keys are
        "Basis", "Total", "Active", "Inactive", "Inconclusive", "Unspecified"
        and "Probe"; with the fallback schema only "Basis", "Total", "Active",
        "Inactive" and "Inconclusive" are present. Missing counts are 0.

    Raises
    ------
    ValueError
        If ``count_basis`` is not "CID" or "SID". Raised before any request
        is sent.
    """
    # Validate first so a bad argument fails fast, without a network round-trip.
    basis = count_basis.upper()
    if basis not in {"CID", "SID"}:
        raise ValueError("count_basis must be 'CID' or 'SID'")

    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/summary/JSON"
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Only the first AssaySummary entry of the response is used.
    summary = r.json()["AssaySummaries"]["AssaySummary"][0]
    return _counts_from_summary(summary, basis)


def _counts_from_summary(summary, basis):
    """Extract the outcome counts for ``basis`` ("CID" or "SID") from one summary dict."""
    # Preferred schema: keys such as "CIDCountAll" / "SIDCountActive".
    prefix = f"{basis}Count"
    if f"{prefix}All" in summary:
        return {
            "Basis": basis,
            "Total": summary.get(f"{prefix}All", 0),
            "Active": summary.get(f"{prefix}Active", 0),
            "Inactive": summary.get(f"{prefix}Inactive", 0),
            "Inconclusive": summary.get(f"{prefix}Inconclusive", 0),
            "Unspecified": summary.get(f"{prefix}Unspecified", 0),
            "Probe": summary.get(f"{prefix}Probe", 0),
        }

    # Fallback schema (less common, but we handle it): try a couple of
    # plausible alternatives if PubChem changes formatting. The total that
    # matches the requested basis is tried before the other basis, so an SID
    # request is not answered with a CID total when both keys are present;
    # the other basis is kept only as a last resort.
    other = "SID" if basis == "CID" else "CID"
    alt_map = {
        "Total": ["TotalCount", f"{basis}Count", f"{other}Count"],
        "Active": ["ActiveCount"],
        "Inactive": ["InactiveCount"],
        "Inconclusive": ["InconclusiveCount"],
    }
    out = {"Basis": basis}
    for label, candidates in alt_map.items():
        # First candidate key present in the summary wins; default to 0.
        out[label] = next((summary[c] for c in candidates if c in summary), 0)
    return out
# Fetch the outcome counts for AID 743139 on both bases (CID first, then SID)
# and print them side by side for comparison.
counts_cid, counts_sid = (
    get_assay_outcome_counts(743139, basis) for basis in ("CID", "SID")
)
print("CID counts:", counts_cid)
print("SID counts:", counts_sid)
CID counts: {'Basis': 'CID', 'Total': 8099, 'Active': 326, 'Inactive': 6114, 'Inconclusive': 2092, 'Unspecified': 0, 'Probe': 0}
SID counts: {'Basis': 'SID', 'Total': 10486, 'Active': 379, 'Inactive': 7562, 'Inconclusive': 2545, 'Unspecified': 0, 'Probe': 0}
import requests
def get_assay_summary(aid):
    """
    Fetch the PubChem summary for one BioAssay AID and flatten it.

    Returns a dict holding the AID, the assay name ("" when absent) and the
    Total/Active/Inactive/Inconclusive counts on both the CID and the SID
    basis (0 when a count is absent from the response).
    """
    endpoint = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/summary/JSON"
    response = requests.get(endpoint, timeout=30)
    response.raise_for_status()
    payload = response.json()
    # Only the first AssaySummary entry is read.
    record = payload["AssaySummaries"]["AssaySummary"][0]

    result = {"AID": aid, "AssayName": record.get("Name", "")}
    # (output label, suffix of the PubChem key); "Total" is stored as "...CountAll".
    outcome_suffixes = (
        ("Total", "All"),
        ("Active", "Active"),
        ("Inactive", "Inactive"),
        ("Inconclusive", "Inconclusive"),
    )
    # CID counts come first (preferred for ML), SID counts follow for completeness.
    for basis in ("CID", "SID"):
        for label, suffix in outcome_suffixes:
            result[f"{basis}_{label}"] = record.get(f"{basis}Count{suffix}", 0)
    return result
import pandas as pd
# Candidate assays to compare.
# NOTE(review): the target labels below are the original author's. Several do
# not match the assay names in the recorded output of this cell: 720551 came
# back as "qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS" (343,666 CIDs),
# and 743077 and 720552 came back as "agonists" assays. Verify each AID
# against PubChem before relying on a label.
candidate_aids = [
    743040,  # ERα agonist
    743077,  # ERα antagonist
    720552,  # AR antagonist
    720551,  # AR agonist
    743122,  # AhR activation
    743065,  # PPARγ
    743067,  # PPARδ
    743053,  # NRF2 antioxidant response
]

# Best-effort fetch: a failing AID becomes an "ERROR" row (with the message)
# rather than aborting the whole comparison.
rows = []
for aid in candidate_aids:
    try:
        row = get_assay_summary(aid)
    except Exception as e:
        row = {"AID": aid, "AssayName": "ERROR", "Error": str(e)}
    rows.append(row)

df = pd.DataFrame(rows)
df
| | AID | AssayName | CID_Total | CID_Active | CID_Inactive | CID_Inconclusive | SID_Total | SID_Active | SID_Inactive | SID_Inconclusive |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 743040 | qHTS assay to identify small molecule agonists... | 8099 | 341 | 7706 | 201 | 10486 | 461 | 9812 | 213 |
| 1 | 743077 | qHTS assay to identify small molecule agonists... | 8099 | 438 | 7303 | 736 | 10486 | 589 | 9089 | 808 |
| 2 | 720552 | qHTS assay for small molecule agonists of the ... | 8099 | 527 | 6960 | 816 | 10488 | 659 | 8888 | 941 |
| 3 | 720551 | qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS | 343666 | 1267 | 341674 | 730 | 343909 | 1267 | 341912 | 730 |
| 4 | 743122 | qHTS assay to identify small molecule that act... | 8099 | 875 | 6397 | 1357 | 10486 | 1063 | 7945 | 1478 |
| 5 | 743065 | qHTS assay to identify small molecule antagoni... | 8099 | 1536 | 5937 | 912 | 10486 | 1880 | 7568 | 1038 |
| 6 | 743067 | qHTS assay to identify small molecule antagoni... | 8099 | 378 | 5631 | 2491 | 10486 | 426 | 7088 | 2972 |
| 7 | 743053 | qHTS assay to identify small molecule agonists... | 8099 | 260 | 7168 | 918 | 10486 | 372 | 9070 | 1044 |
# Share of tested compounds (CID basis) that were active / inconclusive.
df["CID_ActiveFraction"] = df["CID_Active"].div(df["CID_Total"])
df["CID_InconclusiveFraction"] = df["CID_Inconclusive"].div(df["CID_Total"])

# Compact view, assays with the highest active fraction first.
summary_columns = [
    "AID",
    "AssayName",
    "CID_Total",
    "CID_Active",
    "CID_ActiveFraction",
    "CID_InconclusiveFraction",
]
df[summary_columns].sort_values(by="CID_ActiveFraction", ascending=False)
| | AID | AssayName | CID_Total | CID_Active | CID_ActiveFraction | CID_InconclusiveFraction |
|---|---|---|---|---|---|---|
| 5 | 743065 | qHTS assay to identify small molecule antagoni... | 8099 | 1536 | 0.189653 | 0.112606 |
| 4 | 743122 | qHTS assay to identify small molecule that act... | 8099 | 875 | 0.108038 | 0.167552 |
| 2 | 720552 | qHTS assay for small molecule agonists of the ... | 8099 | 527 | 0.065070 | 0.100753 |
| 1 | 743077 | qHTS assay to identify small molecule agonists... | 8099 | 438 | 0.054081 | 0.090875 |
| 6 | 743067 | qHTS assay to identify small molecule antagoni... | 8099 | 378 | 0.046672 | 0.307569 |
| 0 | 743040 | qHTS assay to identify small molecule agonists... | 8099 | 341 | 0.042104 | 0.024818 |
| 7 | 743053 | qHTS assay to identify small molecule agonists... | 8099 | 260 | 0.032103 | 0.113347 |
| 3 | 720551 | qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS | 343666 | 1267 | 0.003687 | 0.002124 |
def suitability(row, *, min_total=5000, min_actives=200,
                imbalanced_below=0.01, intermediate_below=0.05):
    """
    Classify one assay row by how suitable it is as a teaching/ML dataset.

    Parameters
    ----------
    row : mapping
        Must provide "CID_Total", "CID_Active" and "CID_ActiveFraction"
        (e.g. a DataFrame row passed by ``df.apply(suitability, axis=1)``).
    min_total : int, keyword-only
        Rows with fewer compounds than this are "Too Small".
    min_actives : int, keyword-only
        Rows with fewer actives than this are "Too Few Actives".
    imbalanced_below : float, keyword-only
        Active fraction below this is "Advanced / Highly Imbalanced".
    intermediate_below : float, keyword-only
        Active fraction below this (but not below ``imbalanced_below``)
        is "Intermediate".

    Returns
    -------
    str
        "No Data", "Too Small", "Too Few Actives",
        "Advanced / Highly Imbalanced", "Intermediate" or "Intro-Friendly".
        The checks are applied in that order; the first match wins.
    """
    total = row["CID_Total"]
    active = row["CID_Active"]
    fraction = row["CID_ActiveFraction"]

    # A missing value (None, or NaN -- the only value not equal to itself)
    # makes every "<" comparison below False, which would otherwise fall
    # through to "Intro-Friendly". Rows for assays whose download failed
    # carry NaN counts, so label them explicitly instead.
    if any(v is None or v != v for v in (total, active, fraction)):
        return "No Data"

    if total < min_total:
        return "Too Small"
    if active < min_actives:
        return "Too Few Actives"
    if fraction < imbalanced_below:
        return "Advanced / Highly Imbalanced"
    if fraction < intermediate_below:
        return "Intermediate"
    return "Intro-Friendly"
# Label every assay: axis=1 makes apply() pass each row to suitability(),
# and the returned strings are stored in a new "SuggestedUse" column.
df["SuggestedUse"] = df.apply(suitability, axis=1)
# Bare expression; presumably the last line of a notebook cell so that the
# updated table is displayed.
df
| | AID | AssayName | CID_Total | CID_Active | CID_Inactive | CID_Inconclusive | SID_Total | SID_Active | SID_Inactive | SID_Inconclusive | CID_ActiveFraction | CID_InconclusiveFraction | SuggestedUse |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 743040 | qHTS assay to identify small molecule agonists... | 8099 | 341 | 7706 | 201 | 10486 | 461 | 9812 | 213 | 0.042104 | 0.024818 | Intermediate |
| 1 | 743077 | qHTS assay to identify small molecule agonists... | 8099 | 438 | 7303 | 736 | 10486 | 589 | 9089 | 808 | 0.054081 | 0.090875 | Intro-Friendly |
| 2 | 720552 | qHTS assay for small molecule agonists of the ... | 8099 | 527 | 6960 | 816 | 10488 | 659 | 8888 | 941 | 0.065070 | 0.100753 | Intro-Friendly |
| 3 | 720551 | qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS | 343666 | 1267 | 341674 | 730 | 343909 | 1267 | 341912 | 730 | 0.003687 | 0.002124 | Advanced / Highly Imbalanced |
| 4 | 743122 | qHTS assay to identify small molecule that act... | 8099 | 875 | 6397 | 1357 | 10486 | 1063 | 7945 | 1478 | 0.108038 | 0.167552 | Intro-Friendly |
| 5 | 743065 | qHTS assay to identify small molecule antagoni... | 8099 | 1536 | 5937 | 912 | 10486 | 1880 | 7568 | 1038 | 0.189653 | 0.112606 | Intro-Friendly |
| 6 | 743067 | qHTS assay to identify small molecule antagoni... | 8099 | 378 | 5631 | 2491 | 10486 | 426 | 7088 | 2972 | 0.046672 | 0.307569 | Intermediate |
| 7 | 743053 | qHTS assay to identify small molecule agonists... | 8099 | 260 | 7168 | 918 | 10486 | 372 | 9070 | 1044 | 0.032103 | 0.113347 | Intermediate |