Collect — RFD3
Get structures
# SafeProtein hazards (429 structures)
./data_pipelines/download_safeprotein.sh
# UniProt benigns with matched PDBs
./data_pipelines/download_uniprot_pdb_benigns.sh 500 100 300

Build sources + filter
# Merge CSVs, filter residues ≤300, balance per length bucket
uv run python -m data_pipelines.filter_pdbs \
--sources tutorials/sae_data_rfd3_partial/sources.csv \
--out tutorials/sae_data_rfd3_partial/sources_filtered.csv \
--bin-size 50
uv run python -m data_pipelines.build_inputs \
--sources tutorials/sae_data_rfd3_partial/sources_filtered.csv \
--out tutorials/sae_data_rfd3_partial/train_inputs.json \
--model rfd3 --partial-t 5.0

inputs.json format:
{
"hazard_P00626": { "input": "/path/to/3g8g.pdb", "partial_t": 5.0 },
"benign_A0PK11": { "input": "/path/to/2kw3.pdb", "partial_t": 5.0 }
}

Collect
saffron collect model=rfd3 \
hooks=rfd3_partial \
inputs=tutorials/sae_data_rfd3_partial/train_inputs.json \
out_dir=outputs/collect/rfd3_train

For sequences without PDBs, first fold with RF3: `rf3 fold inputs=rf3_inputs.json out_dir=pdbs/ skip_existing=True`, then run attach_pdbs and optionally filter_pdbs.
Output: outputs/collect/rfd3_train/activations/activations.h5
Last updated on