Benchmark Summarization
This notebook provides comprehensive summary statistics for the C3PO (CHEBI Classification Programs Ontology) Benchmark dataset. The C3PO benchmark is designed to test the ability of models to classify chemical compounds (represented as SMILES strings) into CHEBI ontology classes.
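To make the task concrete, each prediction can be pictured as a judgment on a (SMILES, class) pair. The values below are illustrative only, not drawn from the dataset:
# Hypothetical example of the classification task:
# given a SMILES string, decide whether it belongs to a given CHEBI class.
example = {
    "smiles": "CCO",            # ethanol
    "class_id": "CHEBI:30879",  # alcohol
    "label": True,              # ethanol is an alcohol
}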
Purpose and Overview
The analysis in this notebook:
1. Examines the distribution of classes and their properties
2. Analyzes the chemical structures in the dataset
3. Explores relationships between structures and classes
4. Generates summary statistics for evaluation
This data was used to produce CSVs for submission to the Hugging Face C3PO dataset.
from pathlib import Path

import pandas as pd

from c3p.datamodel import Dataset

DIR = Path("../../results/2025/benchmark/")

# Load the benchmark dataset from its JSON serialization
with open(DIR / "dataset.json") as f:
    dataset = Dataset.model_validate_json(f.read())
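The as_df helper is not defined in this notebook; a minimal sketch consistent with how it is used below (pydantic models flattened to rows, with list-valued fields replaced by *_count columns such as parents_count) would be:
def as_df(items):
    # Flatten pydantic models into rows; list-valued fields become <field>_count columns
    rows = []
    for item in items:
        row = {}
        for key, value in item.model_dump().items():
            if isinstance(value, list):
                row[f"{key}_count"] = len(value)
            else:
                row[key] = value
        rows.append(row)
    return pd.DataFrame(rows)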
cls_df = as_df(dataset.classes)
# Display a preview of the classes dataframe
cls_df.head(10)
CHEBI Classes Analysis
The dataframe above contains information about each CHEBI class in the benchmark. Let's examine the distribution of key properties, including:
- Parent class relationships
- Cross-references to external databases
- Number of positive examples per class
# Full summary of the classes dataframe, covering both categorical and quantitative columns
cls_df.describe(include='all')
import matplotlib.pyplot as plt
import seaborn as sns
# Set style for plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("deep")
plt.rcParams.update({'font.size': 12})
plt.figure(figsize=(12, 6))
# Create bar chart for parents count distribution
count_values = cls_df['parents_count'].value_counts().sort_index()
plt.bar(count_values.index, count_values.values)
plt.title('Distribution of Parent Classes per CHEBI Class')
plt.xlabel('Number of Parent Classes')
plt.ylabel('Count')
plt.xticks(range(0, int(cls_df['parents_count'].max()) + 1))
plt.tight_layout()
plt.show()
# Summary statistics for parents_count
print("Summary of parents per class:")
print(f"- Total classes: {len(cls_df)}")
print(f"- Average parents per class: {cls_df['parents_count'].mean():.2f}")
print(f"- Maximum parents for a class: {cls_df['parents_count'].max()}")
print(f"- Classes with no parents: {len(cls_df[cls_df['parents_count'] == 0])}")
# Analyze distribution of examples per class
plt.figure(figsize=(12, 6))
# Bin the counts for better visualization
bins = [0, 50, 100, 200, 500, 1000, 5000]
labels = ['1-50', '51-100', '101-200', '201-500', '501-1000', '1001-5000']
binned_counts = pd.cut(cls_df['all_positive_examples_count'], bins=bins, labels=labels)
binned_distribution = binned_counts.value_counts().sort_index()
plt.bar(binned_distribution.index, binned_distribution.values, color='green')
plt.title('Distribution of Positive Examples per CHEBI Class')
plt.xlabel('Number of Examples')
plt.ylabel('Count of Classes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Summary statistics for examples per class
print("Summary of examples per class:")
print(f"- Average examples per class: {cls_df['all_positive_examples_count'].mean():.2f}")
print(f"- Median examples per class: {cls_df['all_positive_examples_count'].median():.2f}")
print(f"- Maximum examples for a class: {cls_df['all_positive_examples_count'].max()}")
print(f"- Minimum examples for a class: {cls_df['all_positive_examples_count'].min()}")
Chemical Structures Analysis
Next, we'll analyze the chemical structures in our dataset. We'll examine their distribution, their complexity (based on SMILES length), and their relationship to CHEBI classes. First, we build the structures dataframe and derive the columns used in the analysis below.
structures_df = as_df(dataset.structures)
structures_df
# Flag structures that are held out in the validation set
validation_set = set(dataset.validation_examples)
def is_in_validation_set(row):
    return row["smiles"] in validation_set
structures_df["in_validation_set"] = structures_df.apply(is_in_validation_set, axis=1)
# Map each SMILES string to the set of CHEBI classes listing it as a positive example
from collections import defaultdict
smiles_to_cls_id = defaultdict(set)
for cls in dataset.classes:
    for smiles in cls.all_positive_examples:
        smiles_to_cls_id[smiles].add(cls.id)
structures_df["cls_ids"] = structures_df["smiles"].apply(lambda x: smiles_to_cls_id[x])
structures_df["cls_ids_count"] = structures_df["cls_ids"].apply(len)
# SMILES length serves as a rough proxy for molecular complexity
structures_df["smiles_length"] = structures_df["smiles"].apply(len)
# '*' wildcards mark unspecified atoms (e.g. variable attachment points) in some structures
structures_df["has_wildcard"] = structures_df["smiles"].apply(lambda x: "*" in x)
structures_df["wildcard_count"] = structures_df["smiles"].apply(lambda x: x.count("*"))
# Describe both categorical and quantitative columns
structures_df.describe(include='all')
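As an aside, the row-wise apply used above to build in_validation_set can be replaced with a vectorized membership test; a one-line equivalent, assuming the smiles column holds plain strings:
# Vectorized equivalent of the apply-based flag (same result, typically faster)
structures_df["in_validation_set"] = structures_df["smiles"].isin(validation_set)
Let's check how many structures are part of the validation set and examine the distribution of structures across CHEBI classes.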
# Analyze the distribution of structures by class count
plt.figure(figsize=(12, 6))
# Create bar chart for class membership count
class_count_distribution = structures_df['cls_ids_count'].value_counts().sort_index()
plt.bar(class_count_distribution.index, class_count_distribution.values, color='purple')
plt.title('Distribution of Structures by Number of Classes')
plt.xlabel('Number of CHEBI Classes')
plt.ylabel('Count of Structures')
plt.xticks(range(0, int(structures_df['cls_ids_count'].max()) + 1, 2))
plt.tight_layout()
plt.show()
# Print validation set information
validation_count = structures_df['in_validation_set'].sum()
print(f"Validation set size: {validation_count} structures ({validation_count/len(structures_df)*100:.2f}% of total)")
# Print structure distribution statistics
print("\nStructure membership in CHEBI classes:")
print(f"- Structures not belonging to any class: {len(structures_df[structures_df['cls_ids_count'] == 0])}")
print(f"- Structures belonging to exactly one class: {len(structures_df[structures_df['cls_ids_count'] == 1])}")
print(f"- Structures belonging to multiple classes: {len(structures_df[structures_df['cls_ids_count'] > 1])}")
print(f"- Maximum classes per structure: {structures_df['cls_ids_count'].max()}")
# Analyze SMILES string length as a measure of molecular complexity
plt.figure(figsize=(12, 6))
# Bin the lengths for better visualization
bins = [0, 25, 50, 100, 200, 500, 2000]
labels = ['1-25', '26-50', '51-100', '101-200', '201-500', '501-2000']
binned_lengths = pd.cut(structures_df['smiles_length'], bins=bins, labels=labels)
length_distribution = binned_lengths.value_counts().sort_index()
plt.bar(length_distribution.index, length_distribution.values, color='orange')
plt.title('Distribution of SMILES String Lengths')
plt.xlabel('SMILES Length')
plt.ylabel('Count of Structures')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Print SMILES length statistics
print("SMILES length statistics:")
print(f"- Average SMILES length: {structures_df['smiles_length'].mean():.2f}")
print(f"- Median SMILES length: {structures_df['smiles_length'].median():.2f}")
print(f"- Minimum SMILES length: {structures_df['smiles_length'].min()}")
print(f"- Maximum SMILES length: {structures_df['smiles_length'].max()}")
Data Export
Finally, we save the processed dataframes and their summary statistics as CSV files for the Hugging Face submission.
# "Slim" subset of classes: only those with at least one cross-reference to an external database
classes_slim_df = cls_df[cls_df["xrefs_count"] > 0]
structures_df.to_csv(DIR / "structures.csv", index=False)
cls_df.to_csv(DIR / "classes.csv", index=False)
classes_slim_df.to_csv(DIR / "classes_slim.csv", index=False)
# also save summaries
cls_df.describe().T.to_csv(DIR / "classes_summary.csv")
classes_slim_df.describe().T.to_csv(DIR / "classes_slim_summary.csv")
structures_df.describe().T.to_csv(DIR / "structures_summary.csv")
# Collect the dataset's scalar (non-list) fields into a one-row metadata table
metadata = pd.DataFrame({k: [v] for k, v in dataset.model_dump().items() if not isinstance(v, list)})
metadata.to_csv(DIR / "metadata.csv", index=False)
metadata
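Once published, the exported CSVs can be read back with pandas or with the Hugging Face datasets library; a sketch using the generic CSV loader (the Hub dataset ID is not specified here, so we point at the local file):
# Hypothetical round-trip check on one of the exported CSVs
from datasets import load_dataset
classes = load_dataset("csv", data_files=str(DIR / "classes.csv"))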