Skip to content

DataFrame Schema

schema

DataFrame schema helpers for FRUST results.

StructureMetadata dataclass

Parsed structure identity independent of display/file names.

Source code in frust/schema.py
25
26
27
28
29
30
31
32
33
34
35
36
@dataclass(frozen=True)
class StructureMetadata:
    """Parsed structure identity independent of display/file names.

    Instances are immutable (``frozen=True``). The first six fields are
    required; ``smiles`` and ``input_smiles`` are optional and default to
    ``None``.
    """

    # Identifier string; the parser in this module composes it from the
    # structure type, substrate, role and reactive position
    # (e.g. "MOL:<substrate>:<role>" or "<TYPE>:<substrate>:r<rpos>").
    structure_id: str
    # The name the structure was supplied under (display/file name).
    custom_name: str
    # Name of the substrate the structure belongs to.
    substrate_name: str
    # Structure category; the parser in this module emits "MOL" or a
    # TS*/INT* label.
    structure_type: str
    # Role label for the molecule (e.g. "ts", or "structure" as the parser's
    # fallback when no known role is recognised).
    molecule_role: str
    # Position parsed from an "rpos(<n>)" name suffix: an int, or pd.NA when
    # the name carries no position. Typed Any to allow both.
    rpos: Any
    # SMILES string for the structure, if one was provided.
    smiles: str | None = None
    # SMILES the structure was generated from, if known.
    # NOTE(review): exact distinction from `smiles` is not visible here - confirm.
    input_smiles: str | None = None

canonical_output_columns

canonical_output_columns(df: DataFrame) -> pd.DataFrame

Rename legacy output suffixes to the compact schema.

Source code in frust/schema.py
63
64
65
66
67
68
69
70
71
72
73
def canonical_output_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename legacy output suffixes to the compact schema.

    Each column label is stringified and checked against every
    ``-<legacy>`` suffix listed in ``OUTPUT_SUFFIXES`` (in mapping order).
    The first suffix that matches is swapped for its compact replacement;
    the part of the label before the suffix, including the ``-``, is kept.

    Args:
        df: Dataframe whose column labels may carry legacy suffixes.

    Returns:
        A renamed copy when at least one column matched, otherwise ``df``
        itself (the same object, not a copy).
    """
    mapping: dict[str, str] = {}
    for column in df.columns:
        label = str(column)
        # First matching (legacy, compact) pair wins; later pairs are ignored.
        hit = next(
            (
                (legacy, compact)
                for legacy, compact in OUTPUT_SUFFIXES.items()
                if label.endswith(f"-{legacy}")
            ),
            None,
        )
        if hit is None:
            continue
        legacy, compact = hit
        # Strip only the legacy word so the "-" separator stays in place.
        mapping[column] = label[: -len(legacy)] + compact

    if not mapping:
        return df
    return df.rename(columns=mapping)

energy_columns

energy_columns(df: DataFrame) -> list[str]

Return energy-like columns in dataframe order.

Source code in frust/schema.py
44
45
46
def energy_columns(df: pd.DataFrame) -> list[str]:
    """Return energy-like columns in dataframe order.

    A column qualifies when its stringified label ends with one of
    ``ENERGY_SUFFIXES``. The original labels (not the stringified forms)
    are returned.
    """

    def _is_energy(label: object) -> bool:
        # str() guards against non-string column labels.
        return str(label).endswith(ENERGY_SUFFIXES)

    return [label for label in df.columns if _is_energy(label)]

infer_group_columns

infer_group_columns(df: DataFrame) -> list[str]

Choose columns that identify one chemical object for lowest filtering.

Source code in frust/schema.py
91
92
93
94
def infer_group_columns(df: pd.DataFrame) -> list[str]:
    """Choose columns that identify one chemical object for lowest filtering.

    Args:
        df: Dataframe to inspect.

    Returns:
        The subset of ``substrate_name``, ``structure_type``,
        ``molecule_role`` and ``rpos`` that exists in ``df``, always in that
        fixed priority order (not in dataframe order).
    """
    available = df.columns
    candidates = ("substrate_name", "structure_type", "molecule_role", "rpos")
    return [name for name in candidates if name in available]

latest_opt_coords_column

latest_opt_coords_column(prefix: str, df: DataFrame) -> str | None

Find the optimized coordinate column matching a vibration prefix.

Source code in frust/schema.py
54
55
56
57
58
59
60
def latest_opt_coords_column(prefix: str, df: pd.DataFrame) -> str | None:
    """Find the optimized coordinate column matching a vibration prefix."""
    for suffix in ("oc", "opt_coords"):
        col = f"{prefix}{suffix}"
        if col in df.columns:
            return col
    return None

metadata_from_mapping

metadata_from_mapping(metadata: dict[str, Any] | None, *, fallback_name: str, smiles: str | None = None) -> StructureMetadata

Build complete metadata from an optional partial mapping.

Source code in frust/schema.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def metadata_from_mapping(
    metadata: dict[str, Any] | None,
    *,
    fallback_name: str,
    smiles: str | None = None,
) -> StructureMetadata:
    """Build complete metadata from an optional partial mapping.

    Keys present in ``metadata`` take precedence; anything missing is filled
    from the result of parsing the structure name.

    Args:
        metadata: Partial mapping of ``StructureMetadata`` field names to
            values. ``None`` or an empty mapping means "parse the name only".
        fallback_name: Name to parse (and to use as ``custom_name``) when the
            mapping has no ``custom_name`` entry.
        smiles: Default SMILES used when the mapping has no ``smiles`` entry.

    Returns:
        A fully populated ``StructureMetadata``. ``structure_type`` is
        upper-cased, and ``rpos`` is an ``int`` or ``pd.NA``.
    """
    if not metadata:
        return parse_structure_name(fallback_name, smiles=smiles)

    display_name = str(metadata.get("custom_name", fallback_name))
    defaults = parse_structure_name(display_name, smiles=smiles)

    # Treat None, pd.NA and NaN-like values as a missing position.
    raw_rpos = metadata.get("rpos", defaults.rpos)
    if raw_rpos is None or raw_rpos is pd.NA or pd.isna(raw_rpos):
        position = pd.NA
    else:
        position = int(raw_rpos)

    raw_type = metadata.get("structure_type", defaults.structure_type)
    # input_smiles falls back to the resolved smiles when not given.
    resolved_smiles = metadata.get("smiles", smiles)

    return StructureMetadata(
        structure_id=str(metadata.get("structure_id", defaults.structure_id)),
        custom_name=display_name,
        substrate_name=str(metadata.get("substrate_name", defaults.substrate_name)),
        structure_type=str(raw_type).upper(),
        molecule_role=str(metadata.get("molecule_role", defaults.molecule_role)),
        rpos=position,
        smiles=resolved_smiles,
        input_smiles=metadata.get("input_smiles", resolved_smiles),
    )

normal_termination_columns

normal_termination_columns(df: DataFrame) -> list[str]

Return normal-termination columns in dataframe order.

Source code in frust/schema.py
49
50
51
def normal_termination_columns(df: pd.DataFrame) -> list[str]:
    """Return normal-termination columns in dataframe order.

    A column qualifies when its stringified label ends with one of
    ``NORMAL_TERMINATION_SUFFIXES``. The original labels (not the
    stringified forms) are returned.
    """

    def _is_normal_termination(label: object) -> bool:
        # str() guards against non-string column labels.
        return str(label).endswith(NORMAL_TERMINATION_SUFFIXES)

    return [label for label in df.columns if _is_normal_termination(label)]

normalize_dataframe

normalize_dataframe(df: DataFrame) -> pd.DataFrame

Normalize old FRUST dataframe columns to the current schema.

This is intentionally conservative: it does not add ligand_name back. It only maps old data into canonical names so older parquet files can still be read by current utilities.

Source code in frust/schema.py
76
77
78
79
80
81
82
83
84
85
86
87
88
def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize old FRUST dataframe columns to the current schema.

    This is intentionally conservative: it does not add ``ligand_name`` back.
    It only maps old data into canonical names so older parquet files can still
    be read by current utilities.

    The input is copied first, so ``df`` itself is never modified. A legacy
    ``ligand_name`` column is renamed to ``substrate_name`` when the latter is
    absent, and dropped when ``substrate_name`` already exists.
    """
    normalized = canonical_output_columns(df.copy())

    if "ligand_name" not in normalized.columns:
        return normalized
    if "substrate_name" in normalized.columns:
        # The canonical column already exists; the legacy one is redundant.
        return normalized.drop(columns=["ligand_name"])
    return normalized.rename(columns={"ligand_name": "substrate_name"})

output_column

output_column(prefix: str, key: str) -> str

Build a dataframe output column with the canonical short suffix.

Source code in frust/schema.py
39
40
41
def output_column(prefix: str, key: str) -> str:
    """Build a dataframe output column with the canonical short suffix.

    Args:
        prefix: Text placed before the ``-`` separator.
        key: Output key; translated through ``OUTPUT_SUFFIXES`` when listed
            there, otherwise used unchanged.

    Returns:
        The column name ``<prefix>-<suffix>``.
    """
    suffix = OUTPUT_SUFFIXES.get(key, key)
    return f"{prefix}-{suffix}"

parse_structure_name

parse_structure_name(name: str, smiles: str | None = None) -> StructureMetadata

Parse legacy structure names when no structured metadata is available.

Source code in frust/schema.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def parse_structure_name(name: str, smiles: str | None = None) -> StructureMetadata:
    """Parse legacy structure names when no structured metadata is available.

    Three name shapes are tried in order:

    1. ``TS<n>(<body>_rpos(<k>))`` / ``INT<n>(<body>_rpos(<k>))`` - a wrapped
       transition state or intermediate.
    2. ``<base>_<role>_rpos(<k>)`` - a molecule with a role and a position.
    3. Anything else - split on the last ``_`` (if any) and matched against
       ``_KNOWN_ROLES``; ``rpos`` is ``pd.NA``.

    Args:
        name: Structure name; converted with ``str()`` before matching.
        smiles: Optional SMILES stored on the result.

    Returns:
        The ``StructureMetadata`` derived from the name.
    """
    label = str(name)

    # Shape 1: wrapped TS/INT name carrying a reactive position.
    ts_or_int = re.match(
        r"^(?P<stype>(?:TS|INT)\d*)\((?P<body>.+)_rpos\((?P<rpos>\d+)\)\)$",
        label,
    )
    if ts_or_int is not None:
        kind = ts_or_int["stype"].upper()
        body = ts_or_int["body"]
        position = int(ts_or_int["rpos"])
        # Every TS variant shares the "ts" role; others use their own label.
        if kind.startswith("TS"):
            wrapped_role = "ts"
        else:
            wrapped_role = kind.lower()
        return StructureMetadata(
            structure_id=f"{kind}:{body}:r{position}",
            custom_name=label,
            substrate_name=body,
            structure_type=kind,
            molecule_role=wrapped_role,
            rpos=position,
            smiles=smiles,
            input_smiles=smiles,
        )

    # Shape 2: "<base>_<role>_rpos(<k>)" molecule name.
    positioned = re.match(r"^(?P<base>.+)_(?P<role>[^_]+)_rpos\((?P<rpos>\d+)\)$", label)
    if positioned is not None:
        stem = positioned["base"]
        mol_role = positioned["role"]
        position = int(positioned["rpos"])
        mol_substrate = _substrate_from_base(stem)
        return StructureMetadata(
            structure_id=f"MOL:{mol_substrate}:{mol_role}:r{position}",
            custom_name=label,
            substrate_name=mol_substrate,
            structure_type="MOL",
            molecule_role=mol_role,
            rpos=position,
            # With no explicit SMILES, the base part of the name is stored.
            input_smiles=smiles or stem,
            smiles=smiles,
        )

    # Shape 3: plain name, optionally ending in "_<role>".
    head, separator, tail = label.rpartition("_")
    if separator:
        plain_role = tail if tail in _KNOWN_ROLES else "structure"
        if plain_role == "structure":
            # Last segment is not a usable role, so it names the substrate.
            plain_substrate = tail
        else:
            plain_substrate = _substrate_from_base(head)
    else:
        plain_role = label if label in _KNOWN_ROLES else "structure"
        plain_substrate = label

    return StructureMetadata(
        structure_id=f"MOL:{plain_substrate}:{plain_role}",
        custom_name=label,
        substrate_name=plain_substrate,
        structure_type="MOL",
        molecule_role=plain_role,
        rpos=pd.NA,
        smiles=smiles,
        input_smiles=smiles,
    )