Skip to content

NPAtlas analysis notebook

path = "../../notebooks-output/npatlas/npatlas.json"
import json
with open(path, 'r') as f:
    objs = json.load(f)

len(objs)
36454
assert all("smiles" in obj for obj in objs)
distinct_structures = set(obj["smiles"] for obj in objs)
len(distinct_structures)
36434

import re
from copy import copy


def flatten_entry(entry: dict):
    flattened = copy(entry)
    #flattened["classyfire_class_name"] = entry["classyfire"]["class"]["name"]
    cf = entry.get("classyfire", {})
    if not cf:
        cf = {}
    if True:
        cf_class = cf.get("class", {})
        if cf_class:
            flattened["classyfire_class_name"] = cf_class.get("name")
        cf_subclass = cf.get("subclass", {})
        if cf_subclass:
            flattened["classyfire_subclass_name"] = cf_subclass.get("name")
        cf_superclass = cf.get("superclass", {})
        if cf_superclass:
            flattened["classyfire_superclass_name"] = cf_superclass.get("name")
        chebi_terms = cf.get("predicted_chebi_terms", [])
        entry["chebi_terms_flat"] = "; ".join(chebi_terms)
        chebi_term_ix = {}
        for t in chebi_terms:
            for k, v in re.findall(r"(.*) \((CHEBI:\d+)\)", t):
                chebi_term_ix[k] = v
        entry["predicted_chebi_index"] = chebi_term_ix
        entry["predicted_chebi_ids"] = tuple(chebi_term_ix.keys())
        entry["predicted_chebi_labels"] = tuple(chebi_term_ix.values())
        lipidmaps_terms = cf.get("predicted_lipidmaps_terms", [])
        entry["lipidmaps_terms_flat"] = "; ".join(lipidmaps_terms)
        lipidmaps_term_ix = {}
        for t in lipidmaps_terms:
            for k, v in re.findall(r"(.*) \((FA\d+)\)", t):
                lipidmaps_term_ix[k] = v
        entry["predicted_lipidmaps_index"] = lipidmaps_term_ix
        entry["predicted_lipidmaps_ids"] = tuple(lipidmaps_term_ix.keys())
        entry["predicted_lipidmaps_labels"] = tuple(lipidmaps_term_ix.values())
    origin_organism = entry.get("origin_organism", {})
    entry["origin_organism_species"] = f"{origin_organism['genus']} {origin_organism['species']}"
    entry["origin_organism_genus"] = origin_organism['genus']
    entry["origin_organism_type"] = origin_organism['type']

    formula = entry.get("mol_formula")
    # split formula string, e.g. "C10H12O5" -> {"C": 10, "H": 12, "O": 5}
    if formula:
        flattened.update({k: int(v) for k, v in re.findall(r"([A-Z][a-z]*)(\d+)", formula)})
    oref = entry.get("origin_reference", {})
    flattened["doi"] = oref.get("doi")
    flattened["pmid"] = oref.get("pmid")

    return flattened

flattened_entries = [flatten_entry(obj) for obj in objs]
len(flattened_entries)
36454
entries_no_cf = [e for e in flattened_entries if "classyfire_class_name" not in e or not e["classyfire_class_name"]]
len(entries_no_cf)
793
len([e for e in flattened_entries if "classyfire_subclass_name" not in e or not e["classyfire_subclass_name"]])
7264
import pandas as pd
entries_df = pd.DataFrame(flattened_entries)
entries_df
id npaid original_name mol_formula mol_weight exact_mass inchikey smiles cluster_id node_id ... doi pmid N S Cl Br I P Se Fe
0 1 NPA000001 Curvularide C C19H37NO5 359.5070 359.2672 BZLIDAVUQDTJQF-HWTFSWDCSA-N CC[C@H](C)[C@@H](CO)NC(=O)/C=C/[C@](C)([C@H]([... 1 1 ... 10.1002/chem.201000652 20680940.0 NaN NaN NaN NaN NaN NaN NaN NaN
1 2 NPA000002 Homopetasinic acid C24H32O6 416.5140 416.2199 MMWCHIIGAIJQMV-DHCANEKFSA-N C[C@H]1[C@@H](CCC2=CC(=O)[C@@H](C[C@]12C)C(=C)... 2 2 ... 10.1016/j.tetlet.2016.01.095 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 3 NPA000003 A-503083 F C18H22N4O13 502.3890 502.1183 RNRCUOCMUNIOMZ-UHFFFAOYSA-N COC1C(C(OC1C(C(=O)N)OC2C(C(C=C(O2)C(=O)O)O)O)N... 3 3 ... 10.7164/antibiotics.57.639 15638324.0 4.0 NaN NaN NaN NaN NaN NaN NaN
3 4 NPA000004 Aqabamycin E2 C16H11N3O5 325.2800 325.0699 NMMDNCZQLOFGES-UHFFFAOYSA-N C1=CC=C(C=C1)C2=C(NC(=C2C3=CC(=C(C=C3)O)[N+](=... 4 4 ... 10.1038/ja.2010.34 20431617.0 3.0 NaN NaN NaN NaN NaN NaN NaN
4 5 NPA000005 Hymenopsin A C22H32O6 392.4920 392.2199 OYLVOLOSQHRPLK-WRXMSMRBSA-N C[C@]1(CCC[C@]2([C@H]1CC=C3[C@@H]2C[C@]45C(O4)... 5 5 ... 10.1021/np900613d 19928955.0 NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
36449 36718 NPA036718 Julichrome Q3-3 C38H38O16 750.7060 750.2160 FCANTWZHYAQEKU-DLUJSSSPSA-N CC(=O)O[C@H](C)[C@H]1[C@@](C)(O)CC(=O)[C@]23O[... 1472 1213 ... 10.1021/ja501630w 24746278.0 NaN NaN NaN NaN NaN NaN NaN NaN
36450 36719 NPA036719 GE81112A C24H34ClN9O10 644.0420 643.2117 MPRVLYUMDIXCGD-UHFFFAOYSA-N NC(=O)OCC(O)CC(NC(=O)C1NCCCC1O)C(=O)NC(CC1=CN=... 2001 1606 ... 10.1021/bi052540k 16533052.0 9.0 NaN NaN NaN NaN NaN NaN NaN
36451 36723 NPA036723 Subvellerolactone C C17H26O5 310.3900 310.1780 KJKLEJNFKHFYGR-PDROMXSDSA-N CC[C@]1(O)OC(=O)C2=C1[C@H](O)[C@@H]1CC(C)(C)C[... 10508 28 ... 10.5281/zenodo.13381755 NaN NaN NaN NaN NaN NaN NaN NaN NaN
36452 36726 NPA036726 10E-cyclonerotriol C15H28O3 256.3860 256.2038 QGUPPGVBDCWDSK-BSYVWGKESA-N C/C(=C\CC[C@@](C)(O)[C@@H]1CC[C@@](C)(O)[C@H]1... 79 74 ... 10.1039/p19750001586 NaN NaN NaN NaN NaN NaN NaN NaN NaN
36453 36727 NPA036727 Epicyclonerodiol Oxide C15H28O3 256.3860 256.2038 CTTSYRDQSMAGNT-PGKPSXLWSA-N C[C@H]1[C@H]([C@@]2(C)CC[C@H](C(C)(C)O)O2)CC[C... 1110 28 ... 10.1248/cpb.32.4419 NaN NaN NaN NaN NaN NaN NaN NaN NaN

36454 rows × 50 columns

entries_df[entries_df["chebi_terms_flat"] == ""]
id npaid original_name mol_formula mol_weight exact_mass inchikey smiles cluster_id node_id ... doi pmid N S Cl Br I P Se Fe
459 463 NPA000463 N-methylwelwitindolinone D isonitrile C22H20N2O4 376.4120 376.1423 UIYNCSYDIUPTBP-VQKSPFJLSA-N C[C@]1(C(=O)[C@H]2[C@H]3C(=O)[C@@]1(C4=C5C(=CC... 360 325 ... 10.1021/np980485t 10217710.0 2.0 NaN NaN NaN NaN NaN NaN NaN
788 795 NPA000795 3-hydroxy-N-methylwelwitindolinone C isonitrile C22H21ClN2O3 396.8740 396.1241 GFNPBZSGZFQTJA-WOHBTAIZSA-N C[C@]1(C(=C[C@H]2C(=O)[C@@]1(C3=C4C(=CC=C3)N(C... 567 506 ... 10.1021/np980485t 10217710.0 2.0 NaN NaN NaN NaN NaN NaN NaN
3363 3400 NPA003400 12-epi-hapalindole J isonitrile C21H24N2 304.4370 304.1939 SLUFHMQYBPOTFZ-XJRBWHPUSA-N C[C@]1(CC[C@H]2[C@@H]([C@H]1[N+]#[C-])C3=CNC4=... 719 628 ... 10.1016/j.phytochem.2007.06.024 17686499.0 2.0 NaN NaN NaN NaN NaN NaN NaN
4838 4901 NPA004901 12-epi-Fischerindole I isonitrile C21H21ClN2 336.8660 336.1393 ZVBCZYGMKQOJFW-QDUSTFBWSA-N C[C@]1([C@@H](C[C@H]2C(=C1[N+]#[C-])C3=C(C2(C)... 2474 415 ... 10.1021/ja00101a015 NaN 2.0 NaN NaN NaN NaN NaN NaN NaN
5410 5476 NPA005476 methoxy-xanthocillin X dimethylether C21H18N2O3 346.3860 346.1317 GTCYCSHLUXYSAO-BKHHGCLFSA-N COC1=CC=C(C=C1)/C=C(/C(=C/C2=CC(=C(C=C2)OC)OC)... 1475 1215 ... 10.7164/antibiotics.21.671 4304616.0 2.0 NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
36449 36718 NPA036718 Julichrome Q3-3 C38H38O16 750.7060 750.2160 FCANTWZHYAQEKU-DLUJSSSPSA-N CC(=O)O[C@H](C)[C@H]1[C@@](C)(O)CC(=O)[C@]23O[... 1472 1213 ... 10.1021/ja501630w 24746278.0 NaN NaN NaN NaN NaN NaN NaN NaN
36450 36719 NPA036719 GE81112A C24H34ClN9O10 644.0420 643.2117 MPRVLYUMDIXCGD-UHFFFAOYSA-N NC(=O)OCC(O)CC(NC(=O)C1NCCCC1O)C(=O)NC(CC1=CN=... 2001 1606 ... 10.1021/bi052540k 16533052.0 9.0 NaN NaN NaN NaN NaN NaN NaN
36451 36723 NPA036723 Subvellerolactone C C17H26O5 310.3900 310.1780 KJKLEJNFKHFYGR-PDROMXSDSA-N CC[C@]1(O)OC(=O)C2=C1[C@H](O)[C@@H]1CC(C)(C)C[... 10508 28 ... 10.5281/zenodo.13381755 NaN NaN NaN NaN NaN NaN NaN NaN NaN
36452 36726 NPA036726 10E-cyclonerotriol C15H28O3 256.3860 256.2038 QGUPPGVBDCWDSK-BSYVWGKESA-N C/C(=C\CC[C@@](C)(O)[C@@H]1CC[C@@](C)(O)[C@H]1... 79 74 ... 10.1039/p19750001586 NaN NaN NaN NaN NaN NaN NaN NaN NaN
36453 36727 NPA036727 Epicyclonerodiol Oxide C15H28O3 256.3860 256.2038 CTTSYRDQSMAGNT-PGKPSXLWSA-N C[C@H]1[C@H]([C@@]2(C)CC[C@H](C(C)(C)O)O2)CC[C... 1110 28 ... 10.1248/cpb.32.4419 NaN NaN NaN NaN NaN NaN NaN NaN NaN

5589 rows × 50 columns

entries_df[entries_df["lipidmaps_terms_flat"] != ""]
id npaid original_name mol_formula mol_weight exact_mass inchikey smiles cluster_id node_id ... doi pmid N S Cl Br I P Se Fe
0 1 NPA000001 Curvularide C C19H37NO5 359.5070 359.2672 BZLIDAVUQDTJQF-HWTFSWDCSA-N CC[C@H](C)[C@@H](CO)NC(=O)/C=C/[C@](C)([C@H]([... 1 1 ... 10.1002/chem.201000652 20680940.0 NaN NaN NaN NaN NaN NaN NaN NaN
1 2 NPA000002 Homopetasinic acid C24H32O6 416.5140 416.2199 MMWCHIIGAIJQMV-DHCANEKFSA-N C[C@H]1[C@@H](CCC2=CC(=O)[C@@H](C[C@]12C)C(=C)... 2 2 ... 10.1016/j.tetlet.2016.01.095 NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 5 NPA000005 Hymenopsin A C22H32O6 392.4920 392.2199 OYLVOLOSQHRPLK-WRXMSMRBSA-N C[C@]1(CCC[C@]2([C@H]1CC=C3[C@@H]2C[C@]45C(O4)... 5 5 ... 10.1021/np900613d 19928955.0 NaN NaN NaN NaN NaN NaN NaN NaN
6 7 NPA000007 Chaetoxanthone A C20H18O7 370.3570 370.1053 PYEDKAHYOPGAKC-PDXJJUDESA-N C[C@]12C[C@@H](C[C@H](O1)C3=C(O2)C=C4C(=C3O)C(... 7 7 ... 10.1021/np800294q 18683985.0 NaN NaN NaN NaN NaN NaN NaN NaN
7 8 NPA000008 Dihydroxydione 13 C23H35NO4S 421.6030 421.2287 MAZMBUSHGWINCC-RHTYRPEXSA-N CCC(=O)[C@H](C)[C@H]([C@@H](C)C(=O)CC/C(=C\CC(... 8 8 ... 10.1021/np030218+ 14575429.0 NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
35904 36172 NPA036172 Asperthrin A C26H27N3O4 445.5190 445.2002 ALXLLOQTGLPCFG-FMLDBKJISA-N CC1(C)C=CC2=C(C=CC3=C2[N+]([O-])=C2C3=C[C@]34N... 193 181 ... 10.3390/md19030157 NaN 3.0 NaN NaN NaN NaN NaN NaN NaN
35922 36190 NPA036190 Pyroglutamylleucine methyl ester C12H20N2O4 256.3020 256.1423 NGUSEYMDYQKQNZ-UHFFFAOYSA-N COC(=O)C(CC(C)C)NC(=O)C1CCC(=O)N1 10353 6873 ... 10.3390/md19040224 NaN 2.0 NaN NaN NaN NaN NaN NaN NaN
36016 36284 NPA036284 5-hydroxy-7-(2′-hydroxypropyl)-2-methyl-chromone C13H14O4 234.2510 234.0892 XTCMHNQMDBEEHF-UHFFFAOYSA-N CC1=CC(=O)C2=C(O)C=C(CC(C)O)C=C2O1 511 458 ... 10.1007/s12272-022-01370-w 35094261.0 NaN NaN NaN NaN NaN NaN NaN NaN
36131 36399 NPA036399 Aspergillamide F C27H32N4O3 460.5780 460.2474 AOHSWRAYNKXWIR-ZDXLHETRSA-N CC(=O)N[C@H](C(=O)N(C)[C@@H](CC1=CC=CC=C1)C(=O... 62 58 ... 10.3390/molecules27249066 36558198.0 4.0 NaN NaN NaN NaN NaN NaN NaN
36276 36544 NPA036544 Penicibisabolane F C15H22O5 282.3360 282.1467 ISHXRANDGDVGJS-BMIGLBTASA-N C[C@@H](CO)CCC[C@](C)(O)C1=C(O)C=C(C(=O)O)C=C1 1374 1156 ... 10.1002/cbdv.202200178 35452170.0 NaN NaN NaN NaN NaN NaN NaN NaN

17240 rows × 50 columns

entries_df[["C", "classyfire_class_name",  "classyfire_subclass_name", "chebi_terms_flat"]].value_counts()
C      classyfire_class_name             classyfire_subclass_name              chebi_terms_flat                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
15.0   Prenol lipids                     Sesquiterpenoids                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     165
30.0   Prenol lipids                     Triterpenoids                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         40
20.0   Prenol lipids                     Diterpenoids                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          38
16.0   Prenol lipids                     Sesquiterpenoids                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      36
20.0   Carboxylic acids and derivatives  Amino acids, peptides, and analogues                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  32
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             ... 
19.0   Lactones                          Delta valerolactones                  oxanes (CHEBI:46942); secondary alcohol (CHEBI:35681); ketone (CHEBI:17087); organic hydroxy compound (CHEBI:33822); carboxylic ester (CHEBI:33308); oxacycle (CHEBI:38104); carbonyl compound (CHEBI:36586); ether (CHEBI:25698); organic oxide (CHEBI:25701); organic molecule (CHEBI:72695); delta-lactone (CHEBI:18946); chemical entity (CHEBI:24431); organic heterocyclic compound (CHEBI:24532); oxygen molecular entity (CHEBI:25806); organic molecular entity (CHEBI:50860); organooxygen compound (CHEBI:36963); polyol (CHEBI:26191); alcohol (CHEBI:30879); lactone (CHEBI:25000)                                                                                                                                                                  1
                                                                               oxanes (CHEBI:46942); enoate ester (CHEBI:51702); secondary alcohol (CHEBI:35681); ketone (CHEBI:17087); oxacycle (CHEBI:38104); monocarboxylic acid (CHEBI:25384); organic oxide (CHEBI:25701); organic molecule (CHEBI:72695); delta-lactone (CHEBI:18946); chemical entity (CHEBI:24431); organic heterocyclic compound (CHEBI:24532); organic acid (CHEBI:64709); carboxylic acid (CHEBI:33575); organooxygen compound (CHEBI:36963); carboxylic ester (CHEBI:33308); alpha,beta-unsaturated carboxylic ester (CHEBI:51737); oxygen molecular entity (CHEBI:25806); organic molecular entity (CHEBI:50860); polyol (CHEBI:26191); organic hydroxy compound (CHEBI:33822); alcohol (CHEBI:30879); carbonyl compound (CHEBI:36586); lactone (CHEBI:25000)      1
                                                                               oxanes (CHEBI:46942); dicarboxylic acid (CHEBI:35692); enoate ester (CHEBI:51702); secondary alcohol (CHEBI:35681); oxacycle (CHEBI:38104); carboxylic acid (CHEBI:33575); carboxylic acid anion (CHEBI:29067); organic oxide (CHEBI:25701); organic molecule (CHEBI:72695); carbonyl compound (CHEBI:36586); delta-lactone (CHEBI:18946); chemical entity (CHEBI:24431); organic heterocyclic compound (CHEBI:24532); organooxygen compound (CHEBI:36963); carboxylic ester (CHEBI:33308); alpha,beta-unsaturated carboxylic ester (CHEBI:51737); oxygen molecular entity (CHEBI:25806); organic molecular entity (CHEBI:50860); polyol (CHEBI:26191); organic hydroxy compound (CHEBI:33822); alcohol (CHEBI:30879); lactone (CHEBI:25000)                     1
                                                                               oxanes (CHEBI:46942); dicarboxylic acid (CHEBI:35692); carbonyl compound (CHEBI:36586); secondary alcohol (CHEBI:35681); cyclic ketone (CHEBI:3992); organic hydroxy compound (CHEBI:33822); carboxylic ester (CHEBI:33308); oxacycle (CHEBI:38104); organic oxide (CHEBI:25701); organic molecule (CHEBI:72695); delta-lactone (CHEBI:18946); chemical entity (CHEBI:24431); organic heterocyclic compound (CHEBI:24532); organooxygen compound (CHEBI:36963); oxygen molecular entity (CHEBI:25806); organic molecular entity (CHEBI:50860); polyol (CHEBI:26191); alcohol (CHEBI:30879); ketone (CHEBI:17087); lactone (CHEBI:25000)                                                                                                                          1
201.0  Carboxylic acids and derivatives  Amino acids, peptides, and analogues                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   1
Name: count, Length: 21915, dtype: int64
entries_df[["C", "classyfire_class_name",  "classyfire_subclass_name", "chebi_terms_flat", "lipidmaps_terms_flat"]].value_counts()
C      classyfire_class_name             classyfire_subclass_name                  chebi_terms_flat                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            lipidmaps_terms_flat
15.0   Prenol lipids                     Sesquiterpenoids                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              165
30.0   Prenol lipids                     Triterpenoids                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  40
20.0   Prenol lipids                     Diterpenoids                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   38
16.0   Prenol lipids                     Sesquiterpenoids                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               36
20.0   Carboxylic acids and derivatives  Amino acids, peptides, and analogues                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           32
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      ... 
19.0   Indoles and derivatives           Pyridoindoles                             pyridoindole (CHEBI:48888); indoles (CHEBI:24828); imidazopyridine (CHEBI:46908); organic aromatic compound (CHEBI:33659); hydroxylamines (CHEBI:24709); pyridines (CHEBI:26421); carboxamide (CHEBI:37622); imidazolines (CHEBI:53095); carboximidic acid (CHEBI:48378); organic molecular entity (CHEBI:50860); organonitrogen compound (CHEBI:35352); lactam (CHEBI:24995); dipolar compound (CHEBI:51151); polyol (CHEBI:26191); enol (CHEBI:33823); organonitrogen heterocyclic compound (CHEBI:38101); pnictogen molecular entity (CHEBI:33302); organic oxide (CHEBI:25701); organic molecule (CHEBI:72695); carbonyl compound (CHEBI:36586); chemical entity (CHEBI:24431); organic heterocyclic compound (CHEBI:24532); benzenoid aromatic compound (CHEBI:33836); organooxygen compound (CHEBI:36963); amide (CHEBI:32988); nitrogen molecular entity (CHEBI:51143); oxygen molecular entity (CHEBI:25806); organic hydroxy compound (CHEBI:33822); alcohol (CHEBI:30879)                                                                                                   1
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         1
                                         N-alkylindoles                            indoles (CHEBI:24828); pyrroles (CHEBI:26455); benzenoid aromatic compound (CHEBI:33836); organic aromatic compound (CHEBI:33659); carboximidic acid (CHEBI:48378); dipolar compound (CHEBI:51151); organonitrogen heterocyclic compound (CHEBI:38101); primary alcohol (CHEBI:15734); pnictogen molecular entity (CHEBI:33302); organic molecular entity (CHEBI:50860); organonitrogen compound (CHEBI:35352); organic molecule (CHEBI:72695); chemical entity (CHEBI:24431); organic heterocyclic compound (CHEBI:24532); organooxygen compound (CHEBI:36963); oxygen molecular entity (CHEBI:25806); polyol (CHEBI:26191); organic hydroxy compound (CHEBI:33822); alcohol (CHEBI:30879); nitrogen molecular entity (CHEBI:51143)                                                                                                                                                                                                                                                                                                                                                  1
                                         Indolyl carboxylic acids and derivatives  tetralins (CHEBI:36786); isoindoles (CHEBI:24897); indoles (CHEBI:24828); N-acylpyrrolidine (CHEBI:46766); aromatic ketone (CHEBI:76224); secondary amine (CHEBI:32863); pyrrolidin-2-ones (CHEBI:74223); dicarboximide (CHEBI:35356); carbonyl compound (CHEBI:36586); enone (CHEBI:51689); enamine (CHEBI:47989); carboxylic ester (CHEBI:33308); lactam (CHEBI:24995); organonitrogen compound (CHEBI:35352); organooxygen compound (CHEBI:36963); organonitrogen heterocyclic compound (CHEBI:38101); pnictogen molecular entity (CHEBI:33302); organic molecular entity (CHEBI:50860); organic oxide (CHEBI:25701); organic molecule (CHEBI:72695); indolyl carboxylic acid (CHEBI:46867); chemical entity (CHEBI:24431); benzenoid aromatic compound (CHEBI:33836); organic heterocyclic compound (CHEBI:24532); pyrrolidines (CHEBI:38260); oxygen molecular entity (CHEBI:25806); ketone (CHEBI:17087); nitrogen molecular entity (CHEBI:51143); amine (CHEBI:32952); pyrrolidinone (CHEBI:38275); amino acid (CHEBI:33709); peptide (CHEBI:16670)                            1
201.0  Carboxylic acids and derivatives  Amino acids, peptides, and analogues                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            1
Name: count, Length: 21975, dtype: int64
entries_df[["C", "classyfire_class_name",  "classyfire_subclass_name"]].value_counts()
C      classyfire_class_name             classyfire_subclass_name            
15.0   Prenol lipids                     Sesquiterpenoids                        868
30.0   Prenol lipids                     Triterpenoids                           366
20.0   Prenol lipids                     Diterpenoids                            186
15.0   Organooxygen compounds            Alcohols and polyols                    159
16.0   Benzopyrans                       1-benzopyrans                           156
                                                                                ... 
22.0   Pyridines and derivatives         Bipyridines and oligopyridines            1
       Pteridines and derivatives        Alloxazines and isoalloxazines            1
       Phenols                           Cresols                                   1
                                         1-hydroxy-4-unsubstituted benzenoids      1
201.0  Carboxylic acids and derivatives  Amino acids, peptides, and analogues      1
Name: count, Length: 3834, dtype: int64
entries_df[(entries_df["C"] > 20) & (entries_df["classyfire_subclass_name"] == "Sesquiterpenoids")][["C", "classyfire_class_name",  "classyfire_subclass_name"]].value_counts()
C     classyfire_class_name  classyfire_subclass_name
21.0  Prenol lipids          Sesquiterpenoids            85
23.0  Prenol lipids          Sesquiterpenoids            85
25.0  Prenol lipids          Sesquiterpenoids            75
22.0  Prenol lipids          Sesquiterpenoids            67
29.0  Prenol lipids          Sesquiterpenoids            66
24.0  Prenol lipids          Sesquiterpenoids            52
26.0  Prenol lipids          Sesquiterpenoids            51
27.0  Prenol lipids          Sesquiterpenoids            43
28.0  Prenol lipids          Sesquiterpenoids            41
30.0  Prenol lipids          Sesquiterpenoids            41
31.0  Prenol lipids          Sesquiterpenoids            21
33.0  Prenol lipids          Sesquiterpenoids            16
32.0  Prenol lipids          Sesquiterpenoids            15
39.0  Prenol lipids          Sesquiterpenoids             7
37.0  Prenol lipids          Sesquiterpenoids             5
34.0  Prenol lipids          Sesquiterpenoids             5
36.0  Prenol lipids          Sesquiterpenoids             4
38.0  Prenol lipids          Sesquiterpenoids             4
35.0  Prenol lipids          Sesquiterpenoids             4
40.0  Prenol lipids          Sesquiterpenoids             2
53.0  Prenol lipids          Sesquiterpenoids             2
42.0  Prenol lipids          Sesquiterpenoids             1
43.0  Prenol lipids          Sesquiterpenoids             1
44.0  Prenol lipids          Sesquiterpenoids             1
45.0  Prenol lipids          Sesquiterpenoids             1
46.0  Prenol lipids          Sesquiterpenoids             1
Name: count, dtype: int64
entries_df[(entries_df["C"] >= 40) & (entries_df["doi"].notna()) & (entries_df["classyfire_subclass_name"] == "Sesquiterpenoids")][["C", "classyfire_subclass_name", "doi", "pmid", "original_name", "smiles"]]
C classyfire_subclass_name doi pmid original_name smiles
3864 43.0 Sesquiterpenoids 10.1016/j.tet.2008.11.078 NaN Meleagrin B C[C@@H]1[C@@H](C[C@@]2([C@]3([C@]14C[C@@]5(CCC...
7697 44.0 Sesquiterpenoids 10.1021/np300751m 23305465.0 Albatrelin D CC1=CC(=C(C2=C1C3=C(C(=C4C(=C3O2)C=C[C@@](O4)(...
7708 40.0 Sesquiterpenoids 10.1021/acs.orglett.5b01353 26024438.0 Cochlearoid A CC(=CCC/C(=C/CC/C(=C/CC1=C2C(=CC(=C1)O)C3=C(C=...
11690 45.0 Sesquiterpenoids 10.1021/acs.orglett.5b01356 26068271.0 Sterhirsutin I C=C1C(=O)C(OC(=O)[C@@]2(C)C[C@@H]3CC4=C(OC(=O)...
18411 46.0 Sesquiterpenoids 10.1021/acs.orglett.5b01356 26068271.0 Sterhirsutin H C=C1C(=O)C(OC(=O)[C@@]2(C)C[C@@H]3CC4=C(OC(=O)...
32821 40.0 Sesquiterpenoids 10.1002/chem.202104484 NaN Sandacrabin C CC(C)=CCC/C(C)=C/CC/C(C)=C/CN1C(C)=[N+](C/C=C(...
35303 53.0 Sesquiterpenoids 10.1055/a-1392-1038 33682913.0 Talatrachyoxazine B C=C(C)[C@@H]1CC[C@@H](C)[C@@]2(O)[C@@H]1C=C(C)...
35304 53.0 Sesquiterpenoids 10.1055/a-1392-1038 33682913.0 Talatrachyoxazine C C=C(C)[C@@H]1CC[C@@H](C)[C@@]2(O)[C@@H]1C=C(C)...
35941 42.0 Sesquiterpenoids 10.3390/md19020098 NaN Dinotoamide J C=CC(C)(C)C1(C[C@@H]2NC(=O)[C@@H]3CCCN3C2=O)C(...

C3P Classification

from c3p.classifier import Classifier

c3p_classifier = Classifier()

!mkdir -p npatlas
def classify_all(classifier, structures):
    n = 0
    results = []
    for r in classifier.classify_iter(structures):
        if r.is_match:
            results.append(r)
            n += 1
            if n % 1000 == 0:
                print(n)

    print(len(results))
    return results

from pathlib import Path
import pandas as pd


path = Path("../../notebooks-output/npatlas/c3p_results.csv")
if path.exists():
    c3p_df = pd.read_csv(path)
else:
    c3p_results = classify_all(c3p_classifier, distinct_structures)
    c3p_df = pd.DataFrame( [r.model_dump() for r in c3p_results] )
    c3p_df.to_csv(path)
c3p_df["class_name"].value_counts()
class_name
diterpenoid                            18926
diol                                    8724
phenylpropanoid                         8711
icosanoid                               8411
semisynthetic derivative                5531
                                       ...  
straight-chain saturated fatty acid        1
tetrachlorobenzene                         1
polyprenol phosphate                       1
aliphatic aldoxime                         1
phosphatidylglycerol                       1
Name: count, Length: 269, dtype: int64

CHEBI classification

from pathlib import Path
import pandas as pd


path = Path("../../notebooks-output/npatlas/chebi_results.csv")
if path.exists():
    chebi_df = pd.read_csv(path)
else:
    from c3p.chebi_classifier import ChEBIClassifier
    chebi_classifier = ChEBIClassifier()
    chebi_results = classify_all(chebi_classifier, distinct_structures)
    chebi_df = pd.DataFrame( [r.model_dump() for r in chebi_results] )
    chebi_df.to_csv(path)

#chebi_results = classify_all(chebi_classifier, list(distinct_structures))
#len(chebi_results)
from c3p.clients.chebifier import ChebifierClient

chebifier = ChebifierClient()
chebifier_results = {}
if False:
    for s in distinct_structures:
        if s not in chebifier_results:
            chebifier_results[s] = chebifier.classify(s)
len(chebifier_results) / len(distinct_structures)
0.0
#list(chebifier_results.values())[0]
chebifier_df = pd.DataFrame( [r.model_dump() for rs in chebifier_results.values() for r in rs] )
import pandas as pd
import matplotlib.pyplot as plt

def plot_class_distribution(df, column_name="class_name", title=None, min_count=1000, save_to=None):

    # Count occurrences of each class
    class_counts = df[column_name].value_counts()
    # filter to be at least N
    class_counts = class_counts[class_counts > min_count]

    # Re-plot the bar chart with c3p_classes on y-axis (horizontal bar chart)
    plt.figure(figsize=(8, 14))
    class_counts.plot(kind='barh')
    plt.ylabel(column_name)
    plt.xlabel("Count")
    plt.title(title)
    if save_to:
        plt.savefig(save_to, dpi=300, bbox_inches='tight')
    plt.show()

plot_class_distribution(c3p_df, column_name="class_name", title="Distribution of c3p_classes in NPAtlas",
                        min_count=1400,
                        save_to="../../notebooks-output/npatlas/npatlas-summary.png")
No description has been provided for this image
#plt.savefig("../../notebooks-output/npatlas/summary.png", dpi=300, bbox_inches="tight")
#plot_class_distribution(chebifier_df, column_name="class_name", title="Distribution of chebifier_classes in NPAtlas")
plot_class_distribution(entries_df, column_name="classyfire_subclass_name", title="Distribution of classyfire_subclasses in NPAtlas", min_count=100)
No description has been provided for this image
plot_class_distribution(entries_df, column_name="classyfire_class_name", title="Distribution of classyfire_classes in NPAtlas", min_count=100)
No description has been provided for this image
plot_class_distribution(entries_df, column_name="classyfire_superclass_name", title="Distribution of classyfire_superclasses in NPAtlas", min_count=5)
No description has been provided for this image
def merge_classifications_df(entries_df, df, renamed_column="c3p_class_name"):
    df2 = df.rename(columns={"class_name": renamed_column})
    joined = df2.merge(entries_df, left_on="input_smiles", right_on="smiles")
    # compare the two columns
    return joined


merged_df = merge_classifications_df(entries_df, c3p_df)
merged_df
Unnamed: 0 input_smiles class_id c3p_class_name is_match reason confidence id npaid original_name ... doi pmid N S Cl Br I P Se Fe
0 0 CC(C)CCCCCCCCCCCC(=O)OCC(COP(=O)(O)OCC(COP(=O)... CHEBI:17517 phosphatidylglycerol True Contains glycerol backbone with 2 fatty acid c... 0.913408 825 NPA000825 DPG ... 10.1021/np990313b 10843572.0 NaN NaN NaN NaN NaN 2.0 NaN NaN
1 1 CC(=O)OCCCCC/C=C\C[C@@]\1(C=CC(=O)/C1=C/C=C\[C... CHEBI:36092 clavulone True Contains characteristic clavulone structural f... 0.750000 14701 NPA014701 Not named ... 10.1016/s0040-4039(00)85917-1 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 2 CCCCC/C=C\CC\1(C=C(C(=O)/C1=C/C=C/CCCC(=O)OC)Cl)O CHEBI:36092 clavulone True Contains characteristic clavulone structural f... 0.750000 5289 NPA005289 Chlorovulone II ... 10.1016/s0040-4039(00)98927-5 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 3 C[C@H](CC1=C([C@@]([C@@H](C1=O)NC(=O)CO)(C(=O)... CHEBI:36092 clavulone True Contains characteristic clavulone structural f... 0.750000 24358 NPA024358 Mccrearamycin D ... 10.1002/anie.201612447 28140487.0 2.0 NaN NaN NaN NaN NaN NaN NaN
4 4 CCCCC/C=C\CC\1(C=C(C(=O)/C1=C\C=C\CCCC(=O)OC)Cl)O CHEBI:36092 clavulone True Contains characteristic clavulone structural f... 0.750000 15407 NPA015407 Chlorovulone III ... 10.1016/s0040-4039(00)98927-5 NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
146946 146878 CC1=C(C2=C(C(=C1Cl)O)C(=O)C3=C(O2)C=C(C=C3C(=O... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 8373 NPA008373 Penicillixanthone ... 10.1016/j.tet.2014.05.105 NaN NaN NaN 2.0 NaN NaN NaN NaN NaN
146947 146879 CC1=CC(=CC2=C1C3=CC(=CC(=C3C(=O)O2)O)OS(=O)(=O... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 7981 NPA007981 Alternariol 5-O-sulfate ... 10.1021/np070447m 18494522.0 NaN NaN NaN NaN NaN NaN NaN NaN
146948 146880 CC(=CCC1=CC(=C2C(=C1O)C(=O)C3=CC(=CC(=C3O2)O)O... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 8026 NPA008026 Umbilicaxanthoside A ... 10.1016/s0031-9422(02)00539-3 NaN NaN NaN NaN NaN NaN NaN NaN NaN
146949 146881 CC1=C(C=C2C(=C1)OC3=C(C=C(C(=C3C2=O)C)C4=CC(=C... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 21472 NPA021472 Verrulactone D ... 10.1038/ja.2015.86 26306815.0 NaN NaN NaN NaN NaN NaN NaN NaN
146950 146882 COC1=C2C(=C(C(=C1)C(CO)C(CO)O)OC)OC3=C(C=CC(=C... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 1570 NPA001570 Dalienxanthone C ... 10.3987/com-14-13136 NaN NaN NaN NaN NaN NaN NaN NaN NaN

146951 rows × 57 columns

merged_df[["classyfire_subclass_name", "c3p_class_name"]].value_counts()
classyfire_subclass_name              c3p_class_name       
Amino acids, peptides, and analogues  diterpenoid              1506
Sesquiterpenoids                      diterpenoid              1365
Amino acids, peptides, and analogues  icosanoid                1265
Depsipeptides                         peptide antibiotic       1216
Amino acids, peptides, and analogues  phenylpropanoid           922
                                                               ... 
Benzoisochromanequinones              tertiary amine oxide        1
Benzophenones                         11-oxo steroid              1
                                      2,5-diketopiperazines       1
Fatty acid esters                     organic sulfide             1
Amines                                aralkylamine                1
Name: count, Length: 7044, dtype: int64
merged_df[["classyfire_class_name", "c3p_class_name"]].value_counts()
classyfire_class_name             c3p_class_name                
Prenol lipids                     diterpenoid                       4180
Carboxylic acids and derivatives  diterpenoid                       1827
Organooxygen compounds            diterpenoid                       1688
Peptidomimetics                   peptide antibiotic                1634
Prenol lipids                     icosanoid                         1605
                                                                    ... 
Macrolides and analogues          alpha-amino acid                     1
                                  2-oxo monocarboxylic acid            1
Macrolide lactams                 secondary alpha-hydroxy ketone       1
                                  acetate ester                        1
Vinylogous esters                 sesquiterpenoid                      1
Name: count, Length: 5166, dtype: int64
merged_df[["classyfire_superclass_name", "classyfire_class_name", "classyfire_subclass_name", "c3p_class_name"]].value_counts()
classyfire_superclass_name       classyfire_class_name             classyfire_subclass_name                    c3p_class_name        
Organic acids and derivatives    Carboxylic acids and derivatives  Amino acids, peptides, and analogues        diterpenoid               1506
Lipids and lipid-like molecules  Prenol lipids                     Sesquiterpenoids                            diterpenoid               1365
Organic acids and derivatives    Carboxylic acids and derivatives  Amino acids, peptides, and analogues        icosanoid                 1265
                                 Peptidomimetics                   Depsipeptides                               peptide antibiotic        1216
                                 Carboxylic acids and derivatives  Amino acids, peptides, and analogues        phenylpropanoid            922
                                                                                                                                         ... 
Organoheterocyclic compounds     Azolidines                        Isoxazolidines                              phenylpropanoid              1
                                                                                                               secondary alcohol            1
                                                                   Oxazolidines                                aliphatic alcohol            1
Benzenoids                       Phenols                           Benzenetriols and derivatives               organohalogen compound       1
Alkaloids and derivatives        Amaryllidaceae alkaloids          Norbelladine-type amaryllidaceae alkaloids  catechols                    1
Name: count, Length: 7044, dtype: int64
def make_correlation_df(df, column1, column2):
    return merged_df[[column1, column2]].value_counts().reset_index(name="count")

correlation_df = make_correlation_df(merged_df, "classyfire_class_name", "c3p_class_name")
correlation_df
## correlation_df = merged_df.DataFrame([(cls1, cls2) for cls1, cls2 in class_c3p_mapping.items() for c3p in c3p_list], columns=["classes", "c3p_classes"])
classyfire_class_name c3p_class_name count
0 Prenol lipids diterpenoid 4180
1 Carboxylic acids and derivatives diterpenoid 1827
2 Organooxygen compounds diterpenoid 1688
3 Peptidomimetics peptide antibiotic 1634
4 Prenol lipids icosanoid 1605
... ... ... ...
5161 Macrolides and analogues alpha-amino acid 1
5162 Macrolides and analogues 2-oxo monocarboxylic acid 1
5163 Macrolide lactams secondary alpha-hydroxy ketone 1
5164 Macrolide lactams acetate ester 1
5165 Vinylogous esters sesquiterpenoid 1

5166 rows × 3 columns

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

def plot_correlation(df, col1, col2, N=20, M=20, mode='resid', title=None):
    """
    Plots a heatmap for top N categories in col1 vs top M categories in col2,
    allowing different cell-level statistics:
      - 'count': raw observed counts (default)
      - 'contri': cell-wise chi-square contribution
      - 'resid': standardized residuals

    Args:
        df (pd.DataFrame): The input DataFrame.
        col1 (str): Column name for x-axis (rows).
        col2 (str): Column name for y-axis (columns).
        N (int): Number of top categories to keep from col1.
        M (int): Number of top categories to keep from col2.
        mode (str): Which metric to display in heatmap cells:
                    'count', 'contri', or 'resid'.
        title (str): Optional plot title.
    """

    # 1. Identify top N categories in col1
    top_n_categories_col1 = df[col1].value_counts().nlargest(N).index
    # 2. Identify top M categories in col2
    top_m_categories_col2 = df[col2].value_counts().nlargest(M).index

    # 3. Filter the DataFrame
    df_filtered = df[
        df[col1].isin(top_n_categories_col1) &
        df[col2].isin(top_m_categories_col2)
    ]

    # 4. Create a contingency table
    contingency_table = pd.crosstab(df_filtered[col1], df_filtered[col2])

    # 5. Perform the chi-squared test once
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Overall chi2 = {chi2:.4f}, p-value = {p:.4e}, dof = {dof}")

    # 6. Decide which cell-level statistic to plot
    if mode == 'count':
        # Raw observed counts
        data_to_plot = contingency_table
        fmt_str = 'd'
        colorbar_label = 'Observed Count'
    elif mode == 'contri':
        # Cell-level chi-square contribution: (O-E)^2 / E
        contrib = (contingency_table - expected) ** 2 / expected
        data_to_plot = pd.DataFrame(contrib, 
                                    index=contingency_table.index,
                                    columns=contingency_table.columns)
        fmt_str = '.2f'
        colorbar_label = 'Chi-square Contribution'
    elif mode == 'resid':
        # Standardized residual: (O-E)/sqrt(E)
        resid = (contingency_table - expected) / np.sqrt(expected)
        data_to_plot = pd.DataFrame(resid, 
                                    index=contingency_table.index,
                                    columns=contingency_table.columns)
        fmt_str = '.1f'
        colorbar_label = 'Standardized Residual'
    else:
        raise ValueError("Invalid mode. Choose from ['count', 'contri', 'resid'].")

    # 7. Plot the heatmap
    plt.figure(figsize=(10, 8))
    # Center=0 is often useful for 'contri' or 'resid' to visually highlight +/- deviance
    center_val = 0 if mode in ('contri', 'resid') else None

    sns.heatmap(
        data_to_plot, 
        annot=True, 
        fmt=fmt_str, 
        cmap='coolwarm' if mode in ('contri','resid') else 'YlGnBu',
        center=center_val
    )

    #plt.figure(figsize=(16, 16))
    plt.title(title if title else f"Top {N} {col1} x Top {M} {col2} [{mode}]")
    plt.xlabel(col2)
    plt.ylabel(col1)
    cbar = plt.gca().collections[0].colorbar
    cbar.set_label(colorbar_label)

    plt.tight_layout()
    plt.show()
    return data_to_plot, 
corrs = plot_correlation(merged_df, "classyfire_class_name", "c3p_class_name", title="Correlation between classyfire and c3p classes")
Overall chi2 = 55174.7285, p-value = 0.0000e+00, dof = 361

No description has been provided for this image
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

def top_significant_pairs(df, col1, col2, N=20, M=20, top_k=20):
    """
    Return a DataFrame of the most positively significant (over-represented) 
    category pairs, based on standardized residuals from the chi-square test.

    1. Filter to top N categories in col1 and top M categories in col2 by frequency.
    2. Build contingency table.
    3. Perform chi2_contingency to get expected counts.
    4. Compute standardized residuals = (O - E) / sqrt(E).
    5. Return a table of pairs sorted by descending standardized residual.

    Args:
        df (pd.DataFrame): input data
        col1 (str): the name of the first categorical column
        col2 (str): the name of the second categorical column
        N (int): keep top N categories in col1
        M (int): keep top M categories in col2
        top_k (int): how many top pairs to return (sorted by largest positive standardized residual)

    Returns:
        pd.DataFrame: A DataFrame with columns:
                      [col1, col2, observed_count, expected_count, std_resid]
    """

    # 1. Identify top N categories in col1
    top_n_categories_col1 = df[col1].value_counts().nlargest(N).index
    # 2. Identify top M categories in col2
    top_m_categories_col2 = df[col2].value_counts().nlargest(M).index

    # 3. Filter the DataFrame to only those categories
    df_filtered = df[
        df[col1].isin(top_n_categories_col1) &
        df[col2].isin(top_m_categories_col2)
    ]

    # 4. Create a contingency table (observed counts)
    contingency_table = pd.crosstab(df_filtered[col1], df_filtered[col2])

    # 5. Perform chi2_contingency to get expected counts
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Overall Chi-square: {chi2:.4f}, p-value: {p:.2e}, dof: {dof}")

    # 6. Compute standardized residuals: (O - E) / sqrt(E)
    observed = contingency_table.values
    standardized_residuals = (observed - expected) / np.sqrt(expected)

    # Flatten into a long-format DataFrame of pairs
    rows = []
    row_categories = contingency_table.index
    col_categories = contingency_table.columns

    for i, row_cat in enumerate(row_categories):
        for j, col_cat in enumerate(col_categories):
            obs = observed[i, j]
            exp = expected[i, j]
            sr = standardized_residuals[i, j]
            rows.append({
                col1: row_cat,
                col2: col_cat,
                "observed_count": obs,
                "expected_count": exp,
                "std_resid": sr
            })

    result_df = pd.DataFrame(rows)

    # 7. Sort by descending standardized residual (show most over-represented pairs first)
    result_df.sort_values("std_resid", ascending=False, inplace=True)

    # Optionally take the top_k pairs
    top_pairs = result_df.head(top_k).reset_index(drop=True)

    return top_pairs
top_subclass_pairs = top_significant_pairs(merged_df, "classyfire_subclass_name", "c3p_class_name", N=100, M=100, top_k=100)

top_subclass_pairs.head(100).to_csv("../../notebooks-output/npatlas/top_subclass_pairs.csv", index=False)
top_subclass_pairs
Overall Chi-square: 359493.0291, p-value: 0.00e+00, dof: 9801

classyfire_subclass_name c3p_class_name observed_count expected_count std_resid
0 Depsipeptides peptide antibiotic 1216 112.095649 104.264644
1 Indoles indole alkaloid 300 10.032536 91.546954
2 Anthraquinones quinone 411 23.342672 80.236636
3 Furanones butenolide 232 8.105954 78.639451
4 Tetraterpenoids xanthophyll 62 0.647257 76.259767
... ... ... ... ... ...
95 Glycosphingolipids beta-D-glucoside 36 1.484332 28.330274
96 Lineolic acids and derivatives unsaturated fatty acid 17 0.346554 28.289063
97 Glycosphingolipids beta-D-galactoside 27 0.870323 28.008747
98 Ergostane steroids sterol 171 26.757722 27.884831
99 Ergostane steroids steroid 177 28.699325 27.682626

100 rows × 5 columns

top_pairs = top_significant_pairs(merged_df, "classyfire_class_name", "c3p_class_name", N=100, M=100, top_k=100)
top_pairs.head(100)
Overall Chi-square: 342666.4721, p-value: 0.00e+00, dof: 9801

classyfire_class_name c3p_class_name observed_count expected_count std_resid
0 Macrolides and analogues macrolide 1071 55.489795 136.325698
1 Peptidomimetics peptide antibiotic 1634 148.230787 122.034366
2 Polypeptides polypeptide 189 3.869543 94.112661
3 Dihydrofurans butenolide 232 7.773573 80.422296
4 Polypeptides macromolecule 180 4.905257 79.057347
... ... ... ... ... ...
95 Lactams alpha-amino acid 7 0.063688 27.485270
96 Pyridines and derivatives alkaloid 46 2.550913 27.204003
97 5'-deoxyribonucleosides ribonucleoside 9 0.107544 27.116229
98 Sphingolipids beta-D-galactoside 27 0.928338 27.059239
99 Naphthacenes flavonoids 117 14.559142 26.847590

100 rows × 5 columns

top_pairs.head(100).to_csv("../../notebooks-output/npatlas/top_pairs.csv", index=False)
top_pairs = top_significant_pairs(merged_df, "lipidmaps_terms_flat", "c3p_class_name", N=100, M=100, top_k=100)
top_pairs.head(100)
Overall Chi-square: 247245.0428, p-value: 0.00e+00, dof: 9801

lipidmaps_terms_flat c3p_class_name observed_count expected_count std_resid
0 Macrolides and lactone polyketides (PK04) macrolide 837 56.039221 104.323747
1 Dibenzofurans, griseofulvins, dibenzopyrans an... isoflavonoid 162 4.512112 74.140795
2 N-acyl amines (FA0802); Fatty Acyls (FA); Fatt... oligopeptide 513 44.354278 70.368275
3 Fatty acyl glycosides of mono- and disaccharid... glycolipid 24 0.121503 68.503571
4 Fatty Acids and Conjugates (FA01); Hydroxy fat... hydroxy fatty acid 28 0.172196 67.060619
... ... ... ... ... ...
95 Fatty alcohols (FA05); Macrolides and lactone ... macrolide 29 1.699579 20.941060
96 Sterol Lipids (ST); C30 isoprenoids (triterpen... triterpenoid 94 14.573423 20.805831
97 Fatty acyl glycosides (FA13); Fatty acyl glyco... D-glucoside 16 0.556952 20.693036
98 Bile acids and derivatives (ST04); Sterol Lipi... 3-oxo-Delta(4) steroid 77 10.369066 20.692172
99 Dicarboxylic acids (FA0117); Benzopyranoids (P... biflavonoid 25 1.311538 20.684562

100 rows × 5 columns

top_pairs = top_significant_pairs(merged_df, "origin_organism_species", "c3p_class_name", N=100, M=100, top_k=100)
top_pairs.head(100)
Overall Chi-square: 84346.4108, p-value: 0.00e+00, dof: 9801

origin_organism_species c3p_class_name observed_count expected_count std_resid
0 Fusarium oxysporum methyl-branched fatty acid 24 0.257265 46.810155
1 Aspergillus candidus volatile organic compound 35 0.734127 39.992273
2 Sorangium cellulosum azole 41 1.012482 39.740265
3 Trichoderma harzianum polypeptide 25 0.446498 36.745445
4 Chaetomium globosum TW1-1 indole alkaloid 27 0.670605 32.151961
... ... ... ... ... ...
95 Microcystis sp. lipopeptide 23 2.859393 11.910654
96 Sorangium cellulosum secondary alcohol 39 7.189646 11.863552
97 Xylaria cf. curta alpha-hydroxy ketone 23 2.938288 11.703636
98 Cyanobacterium sp. lipopeptide 18 1.907766 11.650749
99 Microcystis aeruginosa cannabinoid 30 4.720025 11.636020

100 rows × 5 columns

top_pairs = top_significant_pairs(merged_df, "origin_organism_genus", "c3p_class_name", N=100, M=100, top_k=100)
top_pairs.head(100)
Overall Chi-square: 132810.8974, p-value: 0.00e+00, dof: 9801

origin_organism_genus c3p_class_name observed_count expected_count std_resid
0 Ganoderma 3-oxo steroid 338 41.015615 46.372339
1 Microcystis peptide antibiotic 264 27.949692 44.649445
2 Ganoderma steroid 469 85.520406 41.467455
3 Trichoderma polypeptide 115 7.200228 40.173968
4 Guignardia iridoid monoterpenoid 26 0.406736 40.130040
... ... ... ... ... ...
95 Amycolatopsis amino sugar 22 1.664415 15.762527
96 Streptomyces nucleoside 139 39.895418 15.690335
97 Streptosporangium glycolipid 8 0.244379 15.688613
98 Streptoverticillium indole alkaloid 20 1.406963 15.675050
99 Fomitopsis sesterterpenoid 70 13.160791 15.667764

100 rows × 5 columns

top_pairs = top_significant_pairs(merged_df, "origin_organism_type", "c3p_class_name", N=100, M=100, top_k=100)
top_pairs.head(100)
Overall Chi-square: 22660.3361, p-value: 0.00e+00, dof: 99

origin_organism_type c3p_class_name observed_count expected_count std_resid
0 Bacterium peptide antibiotic 2189 910.772121 42.354877
1 Bacterium oligopeptide 1167 510.670283 29.043689
2 Bacterium amine 830 379.193062 23.150501
3 Bacterium mucopolysaccharide 728 332.059719 21.728092
4 Bacterium quinone 1108 588.989597 21.385640
... ... ... ... ... ...
95 Fungus branched-chain fatty acid 189 178.189453 0.809854
96 Bacterium D-glucoside 194 185.698285 0.609206
97 Bacterium enone 192 183.926354 0.595316
98 Fungus saccharolipid 195 188.519277 0.472004
99 Bacterium catechols 133 130.059676 0.257824

100 rows × 5 columns

melt_cols = ['classyfire_class_name', 'classyfire_subclass_name']
merged_df_melted = pd.melt(
    merged_df,
    id_vars=[col for col in merged_df.columns if col not in melt_cols],
    value_vars=melt_cols,
    var_name='classyfire_level',
    value_name='classyfire_term'
)

# Clean up the classyfire_level values to remove the '_name' suffix
merged_df_melted['classyfire_level'] = merged_df_melted['classyfire_level'].str.replace('_name', '')
merged_df_melted
Unnamed: 0 input_smiles class_id c3p_class_name is_match reason confidence id npaid original_name ... N S Cl Br I P Se Fe classyfire_level classyfire_term
0 0 CC(C)CCCCCCCCCCCC(=O)OCC(COP(=O)(O)OCC(COP(=O)... CHEBI:17517 phosphatidylglycerol True Contains glycerol backbone with 2 fatty acid c... 0.913408 825 NPA000825 DPG ... NaN NaN NaN NaN NaN 2.0 NaN NaN classyfire_class Glycerophospholipids
1 1 CC(=O)OCCCCC/C=C\C[C@@]\1(C=CC(=O)/C1=C/C=C\[C... CHEBI:36092 clavulone True Contains characteristic clavulone structural f... 0.750000 14701 NPA014701 Not named ... NaN NaN NaN NaN NaN NaN NaN NaN classyfire_class Fatty Acyls
2 2 CCCCC/C=C\CC\1(C=C(C(=O)/C1=C/C=C/CCCC(=O)OC)Cl)O CHEBI:36092 clavulone True Contains characteristic clavulone structural f... 0.750000 5289 NPA005289 Chlorovulone II ... NaN NaN NaN NaN NaN NaN NaN NaN classyfire_class Fatty Acyls
3 3 C[C@H](CC1=C([C@@]([C@@H](C1=O)NC(=O)CO)(C(=O)... CHEBI:36092 clavulone True Contains characteristic clavulone structural f... 0.750000 24358 NPA024358 Mccrearamycin D ... 2.0 NaN NaN NaN NaN NaN NaN NaN classyfire_class Fatty Acyls
4 4 CCCCC/C=C\CC\1(C=C(C(=O)/C1=C\C=C\CCCC(=O)OC)Cl)O CHEBI:36092 clavulone True Contains characteristic clavulone structural f... 0.750000 15407 NPA015407 Chlorovulone III ... NaN NaN NaN NaN NaN NaN NaN NaN classyfire_class Fatty Acyls
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
293897 146878 CC1=C(C2=C(C(=C1Cl)O)C(=O)C3=C(O2)C=C(C=C3C(=O... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 8373 NPA008373 Penicillixanthone ... NaN NaN 2.0 NaN NaN NaN NaN NaN classyfire_subclass 1-benzopyrans
293898 146879 CC1=CC(=CC2=C1C3=CC(=CC(=C3C(=O)O2)O)OS(=O)(=O... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 7981 NPA007981 Alternariol 5-O-sulfate ... NaN NaN NaN NaN NaN NaN NaN NaN classyfire_subclass NaN
293899 146880 CC(=CCC1=CC(=C2C(=C1O)C(=O)C3=CC(=CC(=C3O2)O)O... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 8026 NPA008026 Umbilicaxanthoside A ... NaN NaN NaN NaN NaN NaN NaN NaN classyfire_subclass 1-benzopyrans
293900 146881 CC1=C(C=C2C(=C1)OC3=C(C=C(C(=C3C2=O)C)C4=CC(=C... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 21472 NPA021472 Verrulactone D ... NaN NaN NaN NaN NaN NaN NaN NaN classyfire_subclass Linear diarylheptanoids
293901 146882 COC1=C2C(=C(C(=C1)C(CO)C(CO)O)OC)OC3=C(C=CC(=C... CHEBI:50753 isoflavonoid True Molecule contains a fused benzopyran core (6-m... 0.278749 1570 NPA001570 Dalienxanthone C ... NaN NaN NaN NaN NaN NaN NaN NaN classyfire_subclass 1-benzopyrans

293902 rows × 57 columns

merged_df_melted.groupby(['classyfire_level']).size()
classyfire_level
classyfire_class       146951
classyfire_subclass    146951
dtype: int64
top_pairs = top_significant_pairs(merged_df_melted, "classyfire_term", "c3p_class_name", N=100, M=100, top_k=100)
top_pairs.to_csv("../../notebooks-output/npatlas/top_pairs.csv", index=False)
top_pairs.head(100)
Overall Chi-square: 552931.9523, p-value: 0.00e+00, dof: 9801

classyfire_term c3p_class_name observed_count expected_count std_resid
0 Macrolides and analogues macrolide 1071 41.207559 160.421014
1 Peptidomimetics peptide antibiotic 1634 171.772479 111.567618
2 Depsipeptides peptide antibiotic 1216 107.192346 107.096265
3 Polypeptides polypeptide 189 3.828501 94.636739
4 Indoles indole alkaloid 300 10.023272 91.592176
... ... ... ... ... ...
95 Steroids and steroid derivatives triterpenoid 588 156.376690 34.515875
96 Triterpenoids 11beta-hydroxy steroid 514 126.980949 34.344964
97 Anthraquinones polyphenol 258 40.238728 34.328792
98 Terphenyls guaiacols 41 1.365584 33.916674
99 Benzodiazines aralkylamine 49 1.940210 33.785130

100 rows × 5 columns

top_pairs = top_significant_pairs(merged_df_melted, "classyfire_term", "lipidmaps_terms_flat", N=100, M=100, top_k=100)
top_pairs.head(100)
Overall Chi-square: 2849804.4511, p-value: 0.00e+00, dof: 9603

classyfire_term lipidmaps_terms_flat observed_count expected_count std_resid
0 Anthracyclines Anthracyclinones (PK1305) 1008 5.019881 447.657257
1 Angucyclines Angucyclines (PK08) 577 2.070941 399.512535
2 Depsides and depsidones Depsides and depsidones (PK1308) 597 2.324197 390.071150
3 Isoflavonoids Isoflavonoids (PK1205); Benzopyranoids (PK1311) 263 0.565195 349.077821
4 Isochromanequinones Naphthalenes and naphthoquinones (PK1302); Ben... 242 0.481829 347.939265
... ... ... ... ... ...
95 1-benzopyrans Dicarboxylic acids (FA0117); Benzopyranoids (P... 345 30.227066 57.253150
96 Prenol lipids C20 isoprenoids (diterpenes) (PR0104); Prenol ... 987 192.710960 57.217037
97 Fatty amides N-acyl amines (FA0802); Fatty Acyls (FA); Fatt... 235 14.846745 57.135954
98 Prenol lipids Bile acids and derivatives (ST04); Sterol Lipi... 977 190.758468 56.926446
99 Depsipeptides Macrolides and lactone polyketides (PK04) 821 142.260958 56.906250

100 rows × 5 columns

top_pairs = top_significant_pairs(merged_df_melted, "predicted_chebi_terms", "c3p_class_name", N=100, M=100, top_k=100)
#top_pairs.to_csv("../../notebooks-output/npatlas/top_pairs.csv", index=False)
top_pairs.head(100)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/Library/Caches/pypoetry/virtualenvs/c3p-93U7KWO_-py3.11/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'predicted_chebi_terms'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[68], line 1
----> 1 top_pairs = top_significant_pairs(merged_df_melted, "predicted_chebi_terms", "c3p_class_name", N=100, M=100, top_k=100)
      2 #top_pairs.to_csv("../../notebooks-output/npatlas/top_pairs.csv", index=False)
      3 top_pairs.head(100)

Cell In[47], line 30, in top_significant_pairs(df, col1, col2, N, M, top_k)
      6 """
      7 Return a DataFrame of the most positively significant (over-represented) 
      8 category pairs, based on standardized residuals from the chi-square test.
   (...)
     26                   [col1, col2, observed_count, expected_count, std_resid]
     27 """
     29 # 1. Identify top N categories in col1
---> 30 top_n_categories_col1 = df[col1].value_counts().nlargest(N).index
     31 # 2. Identify top M categories in col2
     32 top_m_categories_col2 = df[col2].value_counts().nlargest(M).index

File ~/Library/Caches/pypoetry/virtualenvs/c3p-93U7KWO_-py3.11/lib/python3.11/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File ~/Library/Caches/pypoetry/virtualenvs/c3p-93U7KWO_-py3.11/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'predicted_chebi_terms'