Skip to content

datamodel

ChemicalClass

Bases: BaseModel

Represents a class/grouping of chemical entities.

Source code in c3p/datamodel.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class ChemicalClass(BaseModel):
    """Represents a class/grouping of chemical entities."""
    id: str = Field(..., description="id/curie of the CHEBI class")
    name: str = Field(..., description="rdfs:label of the class in CHEBI")
    definition: Optional[str] = Field(None, description="definition of the structure from CHEBI")
    parents: Optional[List[str]] = Field(default=None, description="parent classes")
    xrefs: Optional[List[str]] = Field(default=None, description="mappings")
    all_positive_examples: List[SMILES_STRING] = []

    def lite_copy(self) -> "ChemicalClass":
        """
        Return a shallow copy of this class whose example list is empty.

        Only the copy is modified: it is given a fresh empty
        ``all_positive_examples`` list, while the original keeps its own.

        Returns:
            The lightweight copy.
        """
        stripped = copy(self)
        stripped.all_positive_examples = []
        return stripped

lite_copy()

Create a copy of the chemical class without the instance fields. Returns: the copy, with its list of positive examples emptied.

Source code in c3p/datamodel.py
35
36
37
38
39
40
41
42
43
44
45
46
def lite_copy(self) -> "ChemicalClass":
    """
    Return a shallow copy of this class whose example list is empty.

    Only the copy is modified: it is given a fresh empty
    ``all_positive_examples`` list, while the original keeps its own.

    Returns:
        The lightweight copy.
    """
    stripped = copy(self)
    stripped.all_positive_examples = []
    return stripped

ChemicalStructure

Bases: BaseModel

Represents a chemical entity with a known specific structure/formula.

Source code in c3p/datamodel.py
12
13
14
15
16
17
18
19
20
21
22
23
class ChemicalStructure(BaseModel):
    """Represents a chemical entity with a known specific structure/formula."""
    name: str = Field(..., description="rdfs:label of the structure in CHEBI")
    smiles: SMILES_STRING = Field(..., description="SMILES string derived from CHEBI")

    def __hash__(self):
        # Hash on the SMILES string alone so hashing agrees with __eq__.
        return hash(self.smiles)

    def __eq__(self, other):
        # Structures compare equal exactly when their SMILES strings match;
        # the name plays no part. Foreign types are deferred to the other operand.
        if isinstance(other, ChemicalStructure):
            return self.smiles == other.smiles
        return NotImplemented

CodeStatistics

Bases: BaseModel

Code statistics

Source code in c3p/datamodel.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
class CodeStatistics(BaseModel):
    """Code statistics"""
    # Number of non-blank lines counted after the first top-level ``def``.
    lines_of_code: int
    # Natural log of ``lines_of_code`` (0 when there are no lines).
    log_lines_of_code: float
    # Estimated indent level (in units of 4 spaces) for each scanned body line.
    indent_by_line: List[int]
    max_indent: int
    # Raw top-level ``from``/``import`` lines seen before the first ``def``.
    imports: List[str]
    imports_count: int
    # Distinct names matched by ``.<name>(`` in the counted lines.
    methods_called: List[str]
    methods_called_count: int
    # Distinct arguments found inside ``Chem.MolFromSmarts(...)`` calls.
    smarts_strings: List[str]
    smarts_strings_count: int
    # Text following the ``def`` keyword for each recorded definition.
    defs: List[str]
    defs_count: int
    # Text following the ``return`` keyword for each return statement.
    returns: List[str]
    returns_count: int
    # Mean of methods_called_count, defs_count, returns_count, log LOC and max_indent.
    complexity: float

    @classmethod
    def from_code(cls, code: str):
        """Extract statistics from code.

        The code is scanned line by line with simple string heuristics (no
        parsing). Scanning stops at the first line starting with
        ``__metadata__``. Lines before the first top-level ``def`` are only
        inspected for imports; after it, indented and blank lines are treated
        as function body, while later non-blank top-level lines are skipped.

        Args:
            code: Python source text.

        Returns:
            An instance of ``cls`` populated with the extracted statistics.
        """
        imports = []
        lines = []
        indent_by_line = []
        returns = []
        defs = []
        last_line_indent = 0
        in_def = False
        for line in code.split("\n"):
            if line.startswith("__metadata__"):
                break
            num_spaces = len(line) - len(line.lstrip())
            line_indent = num_spaces // 4
            if num_spaces % 4:
                # likely not a "true" new line
                line_indent = last_line_indent
            elif line_indent > last_line_indent + 1:
                # likely a continuation of the previous line
                line_indent = last_line_indent
            last_line_indent = line_indent
            if in_def:
                if line.strip() and line_indent == 0:
                    # Non-blank top-level line after the first def: not counted.
                    continue
                if line.strip():
                    lines.append(line)
                # Blank lines still contribute an indent entry.
                indent_by_line.append(line_indent)
            elif line.startswith(("from", "import")):
                imports.append(line)
            elif line.startswith("def"):
                in_def = True
                # Drop only the leading keyword; a blanket replace would also
                # mangle names that contain "def" (e.g. "is_defined").
                defs.append(line[len("def"):].strip())
        lines_of_code = len(lines)
        max_indent = max(indent_by_line) if indent_by_line else 0
        imports_count = len(imports)
        method_re = re.compile(r"\.(\w+)\(")
        # TODO: this misses when a variable is passed
        smarts_re = re.compile(r"Chem.MolFromSmarts\((.+)\)")
        methods_called = []
        smarts_strings = []

        def de_quote(s):
            # Strip one pair of matching surrounding quotes, if present.
            if s.startswith("'") and s.endswith("'"):
                return s[1:-1]
            if s.startswith('"') and s.endswith('"'):
                return s[1:-1]
            return s

        for line in lines:
            methods_called.extend(method_re.findall(line))
            smarts_strings.extend(de_quote(x) for x in smarts_re.findall(line))
            line_lstrip = line.lstrip()
            if line_lstrip.startswith("return"):
                # Remove just the leading keyword, keeping the expression intact.
                returns.append(line_lstrip[len("return"):].strip())
            if line_lstrip.startswith("def"):
                defs.append(line_lstrip[len("def"):].strip())
        # De-duplicate while keeping first-seen order so output is reproducible.
        methods_called = list(dict.fromkeys(methods_called))
        methods_called_count = len(methods_called)
        smarts_strings = list(dict.fromkeys(smarts_strings))
        smarts_strings_count = len(smarts_strings)
        defs_count = len(defs)
        returns_count = len(returns)
        log_loc = math.log(lines_of_code) if lines_of_code else 0
        complexity = (methods_called_count + defs_count + returns_count + log_loc + max_indent) / 5
        return cls(
            lines_of_code=lines_of_code,
            log_lines_of_code=log_loc,
            indent_by_line=indent_by_line,
            max_indent=max_indent,
            imports=imports,
            imports_count=imports_count,
            methods_called=methods_called,
            methods_called_count=methods_called_count,
            smarts_strings=smarts_strings,
            smarts_strings_count=smarts_strings_count,
            defs=defs,
            defs_count=defs_count,
            returns=returns,
            returns_count=returns_count,
            complexity=complexity,
        )

from_code(code) classmethod

Extract statistics from code

Source code in c3p/datamodel.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
@classmethod
def from_code(cls, code: str):
    """Extract statistics from code.

    The code is scanned line by line with simple string heuristics (no
    parsing). Scanning stops at the first line starting with
    ``__metadata__``. Lines before the first top-level ``def`` are only
    inspected for imports; after it, indented and blank lines are treated
    as function body, while later non-blank top-level lines are skipped.

    Args:
        code: Python source text.

    Returns:
        An instance of ``cls`` populated with the extracted statistics.
    """
    imports = []
    lines = []
    indent_by_line = []
    returns = []
    defs = []
    last_line_indent = 0
    in_def = False
    for line in code.split("\n"):
        if line.startswith("__metadata__"):
            break
        num_spaces = len(line) - len(line.lstrip())
        line_indent = num_spaces // 4
        if num_spaces % 4:
            # likely not a "true" new line
            line_indent = last_line_indent
        elif line_indent > last_line_indent + 1:
            # likely a continuation of the previous line
            line_indent = last_line_indent
        last_line_indent = line_indent
        if in_def:
            if line.strip() and line_indent == 0:
                # Non-blank top-level line after the first def: not counted.
                continue
            if line.strip():
                lines.append(line)
            # Blank lines still contribute an indent entry.
            indent_by_line.append(line_indent)
        elif line.startswith(("from", "import")):
            imports.append(line)
        elif line.startswith("def"):
            in_def = True
            # Drop only the leading keyword; a blanket replace would also
            # mangle names that contain "def" (e.g. "is_defined").
            defs.append(line[len("def"):].strip())
    lines_of_code = len(lines)
    max_indent = max(indent_by_line) if indent_by_line else 0
    imports_count = len(imports)
    method_re = re.compile(r"\.(\w+)\(")
    # TODO: this misses when a variable is passed
    smarts_re = re.compile(r"Chem.MolFromSmarts\((.+)\)")
    methods_called = []
    smarts_strings = []

    def de_quote(s):
        # Strip one pair of matching surrounding quotes, if present.
        if s.startswith("'") and s.endswith("'"):
            return s[1:-1]
        if s.startswith('"') and s.endswith('"'):
            return s[1:-1]
        return s

    for line in lines:
        methods_called.extend(method_re.findall(line))
        smarts_strings.extend(de_quote(x) for x in smarts_re.findall(line))
        line_lstrip = line.lstrip()
        if line_lstrip.startswith("return"):
            # Remove just the leading keyword, keeping the expression intact.
            returns.append(line_lstrip[len("return"):].strip())
        if line_lstrip.startswith("def"):
            defs.append(line_lstrip[len("def"):].strip())
    # De-duplicate while keeping first-seen order so output is reproducible.
    methods_called = list(dict.fromkeys(methods_called))
    methods_called_count = len(methods_called)
    smarts_strings = list(dict.fromkeys(smarts_strings))
    smarts_strings_count = len(smarts_strings)
    defs_count = len(defs)
    returns_count = len(returns)
    log_loc = math.log(lines_of_code) if lines_of_code else 0
    complexity = (methods_called_count + defs_count + returns_count + log_loc + max_indent) / 5
    return cls(
        lines_of_code=lines_of_code,
        log_lines_of_code=log_loc,
        indent_by_line=indent_by_line,
        max_indent=max_indent,
        imports=imports,
        imports_count=imports_count,
        methods_called=methods_called,
        methods_called_count=methods_called_count,
        smarts_strings=smarts_strings,
        smarts_strings_count=smarts_strings_count,
        defs=defs,
        defs_count=defs_count,
        returns=returns,
        returns_count=returns_count,
        complexity=complexity,
    )

Config

Bases: BaseModel

Experimental setup

Source code in c3p/datamodel.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
class Config(BaseModel):
    """Experimental setup"""
    llm_model_name: str = "gpt-4o"
    experiment_local_name: Optional[str] = None
    f1_threshold: float = 0.8
    max_attempts: int = 4
    min_positive_examples_for_training: int = 23
    min_negative_examples_for_training: int = 23
    min_positive_examples_for_validation: int = 5
    min_negative_examples_for_validation: int = 5
    max_positive_instances: Optional[int] = None
    max_positive_to_test: Optional[int] = None
    max_negative_to_test: Optional[int] = None
    max_positive_in_prompt: int = 50
    max_negative_in_prompt: int = 20
    max_examples_in_feedback: Optional[int] = 25
    test_proportion: float = 0.2
    use_definitions: bool = True
    use_the_force: bool = False

    @property
    def experiment_name(self):
        """Name of the experiment: ``<model>-<local name>``.

        The model part is the text after the last ``/`` in ``llm_model_name``
        (the whole name when it contains no ``/``); the local part falls back
        to ``"undef"`` when ``experiment_local_name`` is unset or empty.
        """
        local_part = self.experiment_local_name or "undef"
        short_model = self.llm_model_name.rsplit("/", 1)[-1]
        return f"{short_model}-{local_part}"

Dataset

Bases: BaseModel

Represents a dataset of chemical classes.

Source code in c3p/datamodel.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
class Dataset(BaseModel):
    """
    Represents a dataset of chemical classes.
    """
    ontology_version: Optional[str] = None
    min_members: Optional[int] = None
    max_members: Optional[int] = None
    classes: List[ChemicalClass]
    # Defaults to None, so the annotation must be Optional; accessors below
    # treat a missing list as empty.
    structures: Optional[List[ChemicalStructure]] = None
    validation_examples: Optional[List[SMILES_STRING]] = None

    @property
    def name(self):
        """Benchmark name built from the ontology version and member bounds."""
        return f"bench-{self.ontology_version}-{self.min_members}-{self.max_members}"

    def all_smiles(self) -> Set[SMILES_STRING]:
        """Return the set of SMILES strings of all structures (empty if none are set)."""
        return {s.smiles for s in self.structures or []}

    def smiles_to_instance(self) -> Dict[SMILES_STRING, ChemicalStructure]:
        """Return a mapping from SMILES string to structure (empty if none are set).

        When several structures share a SMILES string, the last one wins.
        """
        return {s.smiles: s for s in self.structures or []}

    def get_chemical_class_by_id(self, class_id: str) -> ChemicalClass:
        """Return the first class whose ``id`` equals ``class_id``.

        Raises:
            ValueError: if no class in the dataset has that id.
        """
        for cc in self.classes:
            if cc.id == class_id:
                return cc
        raise ValueError(f"Class {class_id} not found in dataset")

    def get_chemical_class_by_name(self, class_name: str) -> ChemicalClass:
        """Return the first class whose ``name`` equals ``class_name``.

        Raises:
            ValueError: if no class in the dataset has that name.
        """
        for cc in self.classes:
            if cc.name == class_name:
                return cc
        raise ValueError(f"Class {class_name} not found in dataset")

EvaluationExperiment

Bases: BaseModel

Represents an evaluation experiment

Source code in c3p/datamodel.py
360
361
362
363
class EvaluationExperiment(BaseModel):
    """Represents an evaluation experiment"""
    # Experimental setup (model name, thresholds, example limits) for the experiment.
    config: Config
    # Evaluation results; each pairs a set of training results with a test result.
    # NOTE(review): presumably one entry per chemical class - confirm with the producer.
    evaluation_results: List[EvaluationResult]

EvaluationResult

Bases: BaseModel

Result of evaluating a model

Source code in c3p/datamodel.py
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
class EvaluationResult(BaseModel):
    """Result of evaluating a model"""
    train_results: ResultSet
    test_result: Result

    def calculate_reward(self):
        """Compute a scalar reward for this evaluation.

        Both the test result and the best training result have their derived
        statistics refreshed first. The reward is the test true-positive count
        minus the test false-positive count, each weighted by the best
        training precision, minus the test false-negative count weighted by
        the best training negative predictive value.

        Returns:
            The reward value.
        """
        test = self.test_result
        test.calculate()
        best_train = self.train_results.best_result
        best_train.calculate()
        gain = test.num_true_positives * best_train.precision
        false_positive_cost = test.num_false_positives * best_train.precision
        false_negative_cost = test.num_false_negatives * best_train.negative_predictive_value
        return (gain - false_positive_cost) - false_negative_cost

    @property
    def markdown(self):
        """Markdown report for this evaluation (delegates to the training result set)."""
        return self.train_results.markdown

markdown property

Generate markdown for the evaluation result

calculate_reward()

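Calculate the reward for this evaluation: test true positives minus test false positives, each weighted by the best training result's precision, minus test false negatives weighted by its negative predictive value.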

Source code in c3p/datamodel.py
342
343
344
345
346
347
348
349
350
351
352
def calculate_reward(self):
    """Compute a scalar reward for this evaluation.

    Both the test result and the best training result have their derived
    statistics refreshed first. The reward is the test true-positive count
    minus the test false-positive count, each weighted by the best
    training precision, minus the test false-negative count weighted by
    the best training negative predictive value.

    Returns:
        The reward value.
    """
    test = self.test_result
    test.calculate()
    best_train = self.train_results.best_result
    best_train.calculate()
    gain = test.num_true_positives * best_train.precision
    false_positive_cost = test.num_false_positives * best_train.precision
    false_negative_cost = test.num_false_negatives * best_train.negative_predictive_value
    return (gain - false_positive_cost) - false_negative_cost

Result

Bases: BaseModel

Result of running workflow on a chemical class

Source code in c3p/datamodel.py
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
class Result(BaseModel):
    """Result of running workflow on a chemical class"""
    chemical_class: ChemicalClass
    config: Optional[Config] = None
    code: str
    code_statistics: Optional[CodeStatistics] = None
    message: Optional[str] = None
    true_positives: Optional[List[Outcome]] = None
    false_positives: Optional[List[Outcome]] = None
    true_negatives: Optional[List[Outcome]] = None
    false_negatives: Optional[List[Outcome]] = None
    sample_true_negatives: Optional[List[Outcome]] = None
    sample_false_negatives: Optional[List[Outcome]] = None
    attempt: int = 0
    reasoning: Optional[str] = None
    success: bool = True  ## True if no runtime errors or compilation errors
    best: bool = False
    error: Optional[str] = None
    stdout: Optional[str] = None

    num_true_positives: Optional[int] = None
    num_false_positives: Optional[int] = None
    num_true_negatives: Optional[int] = None
    num_false_negatives: Optional[int] = None
    num_negatives: Optional[int] = None

    precision: Optional[float] = None
    recall: Optional[float] = None
    f1: Optional[float] = None
    accuracy: Optional[float] = None
    negative_predictive_value: Optional[float] = None

    def calculate(self):
        """Calculate derived statistics.

        Positive counts are always recomputed from the outcome lists. Negative
        counts are taken from the lists only when not already set, so counts
        supplied explicitly are preserved. Every ratio metric is reset on each
        call and is ``0.0`` when its denominator is zero. Code statistics are
        computed once, when ``code`` is non-empty and none are stored yet.
        """
        self.num_true_positives = len(self.true_positives or [])
        self.num_false_positives = len(self.false_positives or [])
        if self.num_true_negatives is None:
            self.num_true_negatives = len(self.true_negatives or [])
        if self.num_false_negatives is None:
            self.num_false_negatives = len(self.false_negatives or [])
        tp, fp = self.num_true_positives, self.num_false_positives
        tn, fn = self.num_true_negatives, self.num_false_negatives

        def ratio(numerator, denominator):
            # A zero denominator yields 0.0, so every metric stays a float.
            return numerator / denominator if denominator else 0.0

        self.precision = ratio(tp, tp + fp)
        self.recall = ratio(tp, tp + fn)
        if self.precision and self.recall:
            self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
        else:
            self.f1 = 0.0
        # Reset accuracy even when there are no outcomes, so that a stale value
        # from an earlier call cannot survive.
        self.accuracy = ratio(tp + tn, tp + tn + fp + fn)
        self.negative_predictive_value = ratio(tn, tn + fn)
        if self.code and not self.code_statistics:
            self.code_statistics = CodeStatistics.from_code(self.code)

calculate()

Calculate derived statistics

Source code in c3p/datamodel.py
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def calculate(self):
    """Calculate derived statistics.

    Positive counts are always recomputed from the outcome lists. Negative
    counts are taken from the lists only when not already set, so counts
    supplied explicitly are preserved. Every ratio metric is reset on each
    call and is ``0.0`` when its denominator is zero. Code statistics are
    computed once, when ``code`` is non-empty and none are stored yet.
    """
    self.num_true_positives = len(self.true_positives or [])
    self.num_false_positives = len(self.false_positives or [])
    if self.num_true_negatives is None:
        self.num_true_negatives = len(self.true_negatives or [])
    if self.num_false_negatives is None:
        self.num_false_negatives = len(self.false_negatives or [])
    tp, fp = self.num_true_positives, self.num_false_positives
    tn, fn = self.num_true_negatives, self.num_false_negatives

    def ratio(numerator, denominator):
        # A zero denominator yields 0.0, so every metric stays a float.
        return numerator / denominator if denominator else 0.0

    self.precision = ratio(tp, tp + fp)
    self.recall = ratio(tp, tp + fn)
    if self.precision and self.recall:
        self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
    else:
        self.f1 = 0.0
    # Reset accuracy even when there are no outcomes, so that a stale value
    # from an earlier call cannot survive.
    self.accuracy = ratio(tp + tn, tp + tn + fp + fn)
    self.negative_predictive_value = ratio(tn, tn + fn)
    if self.code and not self.code_statistics:
        self.code_statistics = CodeStatistics.from_code(self.code)

ResultSet

Bases: BaseModel

A set of results

Source code in c3p/datamodel.py
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
class ResultSet(BaseModel):
    """A set of results"""
    best_result: Optional[Result] = None
    results: List[Result]
    sorted_attempts: List[int] = []
    experiment_name: Optional[str] = None

    @classmethod
    def from_results(cls, results: List[Result]) -> "ResultSet":
        """Populate the result set from a list of results.

        The best result is the one with the highest F1; a result whose F1 has
        not been calculated yet ranks as 0, and ties go to the earliest
        result. An empty list produces a result set with no best result.
        """
        best = max(
            results,
            key=lambda r: r.f1 if r.f1 is not None else 0.0,
            default=None,
        )
        obj = cls(results=results, best_result=best)
        if obj.best_result is not None:
            obj.best_result.best = True
        return obj

    @property
    def markdown(self):
        """Generate markdown for the result set.

        The heading names the chemical class of the best result, falling back
        to the first result and then to a generic heading when neither
        exists. Metrics that have not been calculated are shown as ``n/a``.
        """
        br = self.best_result
        best_attempt = br.attempt if br is not None else -1
        if br is not None:
            chem = br.chemical_class
        elif self.results:
            chem = self.results[0].chemical_class
        else:
            chem = None

        def fmt(value):
            # Metrics stay None until Result.calculate() has been run.
            return "n/a" if value is None else f"{value:.2f}"

        if chem is not None:
            parts = [f"# Results for {chem.id} {chem.name}\n\n"]
        else:
            parts = ["# Results\n\n"]
        for r in self.results:
            parts.append(f"## Attempt {r.attempt}\n\n")
            if r.attempt == best_attempt:
                parts.append("**Best result**\n\n")
            if r.message:
                parts.append("### Feedback from previous attempt\n\n")
                parts.append(f"{r.message}\n\n")
            if r.reasoning:
                parts.append("### Reasoning\n\n")
                parts.append(f"{r.reasoning}\n\n")

            parts.append("### Code\n\n")
            parts.append(f"```python\n{r.code}\n```\n\n")
            if r.error:
                parts.append("### Error\n\n")
                parts.append(f"```python\n{r.error}\n```\n\n")
            parts.append(f"Precision: {fmt(r.precision)}\n\n")
            parts.append(f"Recall: {fmt(r.recall)}\n\n")
            parts.append(f"F1: {fmt(r.f1)}\n\n")
        return "".join(parts)

markdown property

Generate markdown for the result set

from_results(results) classmethod

Populate the result set from a list of results

Source code in c3p/datamodel.py
301
302
303
304
305
306
307
@classmethod
def from_results(cls, results: List[Result]) -> "ResultSet":
    """Populate the result set from a list of results.

    The best result is the one with the highest F1; a result whose F1 has
    not been calculated yet ranks as 0, and ties go to the earliest
    result. An empty list produces a result set with no best result.
    """
    best = max(
        results,
        key=lambda r: r.f1 if r.f1 is not None else 0.0,
        default=None,
    )
    obj = cls(results=results, best_result=best)
    if obj.best_result is not None:
        obj.best_result.best = True
    return obj