Skip to content

extractor

db_to_dataframe(session, prefix='CHEBI')

Convert a semsql database to a DataFrame

Parameters:

Name Type Description Default
session Session
required
prefix
'CHEBI'

Returns:

Source code in c3p/extractor.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def db_to_dataframe(session: Session, prefix = "CHEBI") -> pd.DataFrame:
    """
    Build one wide DataFrame (a row per subject) from a semsql database.

    Four queries are executed, each result is pivoted with ``eav_to_df``,
    and the four frames are outer-merged on ``subject``:

    1. ``Statements`` rows with a non-null ``value`` (annotations such as
       SMILES, definitions, mappings).
    2. ``Statements`` rows with a non-null ``object`` whose predicate is not
       ``RDFS_SUBCLASS_OF``.
    3. ``Edge`` rows whose object does not start with ``_:``.
    4. ``EntailedEdge`` rows whose predicate is ``RDFS_SUBCLASS_OF`` and
       whose object does not start with ``_:``; these are relabelled with
       the predicate ``entailed_subclass_of``.

    Args:
        session: Session the queries are executed against.
        prefix: When truthy, only rows whose subject starts with this string
            are kept. When falsy, rows whose subject starts with ``_:`` are
            dropped instead.

    Returns:
        The outer merge, on ``subject``, of the four pivoted frames.
    """
    def _restrict_subjects(query: Select, table=Statements) -> Select:
        # Either keep a single prefix, or (no prefix) drop ``_:`` subjects.
        if not prefix:
            return query.where(not_(table.subject.startswith("_:")))
        return query.where(table.subject.startswith(prefix))

    def _fetch(query: Select) -> pd.DataFrame:
        # Run the query and load every returned row into a DataFrame.
        return pd.DataFrame(session.execute(query).all())

    # 1. Literal-valued annotations.
    # NOTE(review): ``!= None`` is presumably the ORM column comparison that
    # builds a SQL null check; do not rewrite it as ``is not None``.
    literal_q = select(Statements.subject,
                       Statements.predicate,
                       Statements.value)
    literal_q = literal_q.where(Statements.value != None)
    literal_q = _restrict_subjects(literal_q)
    literal_df = eav_to_df(_fetch(literal_q))

    # 2. Object-valued statements, leaving out direct subclass assertions.
    object_q = select(Statements.subject,
                      Statements.predicate,
                      Statements.object)
    object_q = object_q.where(Statements.object != None)
    object_q = object_q.where(not_(Statements.predicate == RDFS_SUBCLASS_OF))
    object_q = _restrict_subjects(object_q)
    object_df = eav_to_df(_fetch(object_q), value_column="object")

    # 3. Edges whose object is not a ``_:`` identifier.
    edge_q = select(Edge.subject, Edge.predicate, Edge.object)
    edge_q = edge_q.where(not_(Edge.object.startswith("_:")))
    edge_q = _restrict_subjects(edge_q, Edge)
    edge_df = eav_to_df(_fetch(edge_q), value_column="object")

    # 4. Entailed subclass edges; the predicate column is not selected, so a
    #    constant label is attached before pivoting.
    ancestor_q = select(EntailedEdge.subject, EntailedEdge.object)
    ancestor_q = ancestor_q.where(EntailedEdge.predicate == RDFS_SUBCLASS_OF)
    ancestor_q = ancestor_q.where(not_(EntailedEdge.object.startswith("_:")))
    ancestor_q = _restrict_subjects(ancestor_q, EntailedEdge)
    ancestors = _fetch(ancestor_q)
    ancestors['predicate'] = "entailed_subclass_of"
    ancestor_df = eav_to_df(ancestors, value_column="object")

    # Outer merges keep subjects that appear in only some of the frames.
    merged = literal_df
    for part in (object_df, edge_df, ancestor_df):
        merged = merged.merge(part, on='subject', how='outer')
    return merged

eav_to_df(eav_df, value_column='value')

Convert an EAV DataFrame to a wide-format DataFrame.

Example:

>>> test_eav_df = pd.DataFrame({
...    'subject': ['a', 'a', 'b', 'b', 'b', 'c'],
...    'predicate': ['p1', 'p1', 'p1', 'p1', 'p2', 'p3'],
...    'value': ['v1', 'v2', 'v3', 'v4', 'v4', '']
... })
>>> eav_to_df(test_eav_df)
predicate subject        p1    p2    p3
0               a  [v1, v2]  None  None
1               b  [v3, v4]    v4  None
2               c      None  None

Parameters:

Name Type Description Default
eav_df DataFrame
required
value_column
'value'

Returns:

Source code in c3p/extractor.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def eav_to_df(eav_df: pd.DataFrame, value_column='value') -> pd.DataFrame:
    """
    Convert an EAV DataFrame to a wide-format DataFrame.

    The input must have ``subject`` and ``predicate`` columns plus the column
    named by ``value_column``. The result has one row per subject and one
    column per predicate. Each cell holds:

    * the bare value when the (subject, predicate) pair has exactly one value,
    * a list of values when the pair has several,
    * ``None`` when the pair does not occur.

    Example:

        >>> test_eav_df = pd.DataFrame({
        ...    'subject': ['a', 'a', 'b', 'b', 'b', 'c'],
        ...    'predicate': ['p1', 'p1', 'p1', 'p1', 'p2', 'p3'],
        ...    'value': ['v1', 'v2', 'v3', 'v4', 'v4', '']
        ... })
        >>> eav_to_df(test_eav_df)
        predicate subject        p1    p2    p3
        0               a  [v1, v2]  None  None
        1               b  [v3, v4]    v4  None
        2               c      None  None

    Args:
        eav_df: Long-format frame with ``subject``, ``predicate`` and
            ``value_column`` columns. An empty frame (even one without any
            columns) is accepted.
        value_column: Name of the column holding the values to pivot.

    Returns:
        Wide-format DataFrame with a ``subject`` column followed by one
        column per predicate. For empty input, an empty frame that has only
        the ``subject`` column, so it can still be merged on ``subject``.
    """
    # An empty result set may arrive with no columns at all (for example
    # ``pd.DataFrame([])``); grouping it would raise a KeyError.
    if eav_df.empty:
        return pd.DataFrame(columns=['subject'])

    # Collect every value of each (subject, predicate) pair into a list.
    collected = eav_df.groupby(['subject', 'predicate'])[value_column].agg(list)

    # Every group holds at least one value; unwrap the single-valued ones.
    # Doing this before the pivot keeps the ``subject`` labels out of the
    # unwrapping logic entirely.
    unwrapped = collected.map(lambda vals: vals[0] if len(vals) == 1 else vals)

    # Pivot predicates into columns. Absent pairs come out as NaN; force
    # object dtype so they can be replaced by None.
    wide = unwrapped.unstack().astype(object)
    wide = wide.where(wide.notna(), None)

    return wide.reset_index()

sanitize_smiles(smiles_string)

Sanitizes a SMILES string by: (1) removing whitespace; (2) removing invalid characters; and (3) preserving valid SMILES characters, including brackets, numbers, and symbols.

Parameters:

Name Type Description Default
smiles_string str

Input SMILES string

required

Returns:

Name Type Description
str

Sanitized SMILES string

Source code in c3p/extractor.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def sanitize_smiles(smiles_string):
    """
    Strip a SMILES string down to the characters this module accepts.

    Leading and trailing whitespace is trimmed first, then every character
    outside the allowed set is deleted. The allowed set is:

    - ASCII letters (atomic symbols such as B, C, N, O, P, S, F, Cl, Br, I)
    - digits and ``%`` (ring closures)
    - ``[ ] ( ) = # / \\ @ + - . *``
    - ``:`` and ``,``

    Args:
        smiles_string (str): Input SMILES string

    Returns:
        str: The input with all disallowed characters removed
    """
    # Negated character class: matches exactly the characters to delete.
    pattern = r'[^A-Za-z0-9\[\]\(\)=#/\\@+\-\.\*%:,]'
    disallowed = re.compile(pattern)

    trimmed = smiles_string.strip()
    return disallowed.sub('', trimmed)

split_instances_for_class(cc, all_smiles, validation_proportion=0.2, max_validation_negative=1000)

Split instances for a chemical class into training and validation sets.

This assumes that `cc` has already been loaded with all of its positive instances in `train_positive`.

Parameters:

Name Type Description Default
cc ChemicalClass
required
validation_proportion
0.2

Returns:

Source code in c3p/extractor.py
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
def split_instances_for_class(cc: ChemicalClass, all_smiles: Set[SMILES_STRING], validation_proportion=0.2, max_validation_negative=1000):
    """
    Split instances for a chemical class into training and validation sets.

    We assume that cc is already loaded with all positive instances as
    train_positive. ``cc`` is updated in place; nothing is returned.

    Positives are shuffled and the last ``int(n * validation_proportion)``
    of them become ``validate_positive``. Negatives are every member of
    ``all_smiles`` that is not a positive; after shuffling, the first
    ``min(int(n_neg * validation_proportion), max_validation_negative)`` of
    them become ``validate_negative``. ``train_negative`` is set to ``None``
    and only its size is recorded.

    Args:
        cc: Chemical class whose ``train_positive`` holds all positives.
        all_smiles: Universe of instances from which negatives are drawn.
        validation_proportion: Fraction of positives (and of negatives,
            subject to the cap) placed in the validation set.
        max_validation_negative: Upper bound on the number of validation
            negatives.

    Returns:
        None. Sets ``train_positive``, ``validate_positive``,
        ``validate_negative``, ``train_negative`` and the matching
        ``num_*`` counters on ``cc``.
    """
    # Work on a copy so the list originally held by cc is not shuffled.
    positives = copy(cc.train_positive)
    num_validate_positive = int(len(positives) * validation_proportion)
    num_train_positive = len(positives) - num_validate_positive

    random.shuffle(positives)
    cc.train_positive = positives[:num_train_positive]
    cc.validate_positive = positives[num_train_positive:]
    cc.num_train_positive = len(cc.train_positive)
    cc.num_validate_positive = len(cc.validate_positive)

    negatives = list(all_smiles - set(positives))
    random.shuffle(negatives)
    num_negative_instances = len(negatives)

    # The proportion yields a float; truncate it (as is done for positives)
    # so it can be used as a slice bound and the counters stay integers.
    num_validate_negative = min(
        int(num_negative_instances * validation_proportion),
        max_validation_negative,
    )
    cc.validate_negative = negatives[:num_validate_negative]
    cc.train_negative = None  # can be inferred
    cc.num_validate_negative = len(cc.validate_negative)
    cc.num_train_negative = num_negative_instances - num_validate_negative

validate_dataset(dataset)

Validate a dataset

Parameters:

Name Type Description Default
dataset Dataset
required

Returns:

Source code in c3p/extractor.py
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def validate_dataset(dataset: Dataset) -> Iterator[Tuple[ChemicalStructure, str]]:
    """
    Report structures whose SMILES string is changed by sanitization.

    Args:
        dataset: Dataset whose ``structures`` are checked one by one.

    Yields:
        ``(structure, sanitized_smiles)`` for every structure whose
        ``smiles`` differs from ``sanitize_smiles(smiles)``.
    """
    for structure in dataset.structures:
        raw = structure.smiles
        cleaned = sanitize_smiles(raw)
        if cleaned != raw:
            yield structure, cleaned
        # NOTE(review): the parse result is discarded, so a SMILES that
        # fails to parse is never reported here -- confirm whether this call
        # is intentional or an unfinished check.
        Chem.MolFromSmiles(raw)