Added residue replacement to handle protein chemical modifications and NCAAs

This commit is contained in:
gfzhou 2024-06-12 10:38:09 -07:00
parent bf214835d6
commit d1950f2ba6
6 changed files with 124 additions and 5 deletions

View file

@ -263,6 +263,37 @@ becomes this so it can be parsed correctly:
We know this syntax is hard to work with and we are happy to review PRs if anyone in the community can figure out how to specify all the necessary requirements in a more user friendly way!
<a id="residue_replacement"></a>
### Predicting Proteins with Chemical Modifications or Non-Canonical Amino Acids
To predict proteins with chemically modified residues or non-canonical amino acids (NCAAs), you can use residue replacement. This involves replacing the chemically modified residue or NCAA with a small molecule file that defines the structure of the modified residue. Here is an example that predicts the structure of a phosphorylated protein (from `rf2aa/config/inference/residue_replacement.yaml`):
```
defaults:
- base
job_name: "1h4x"
protein_inputs:
A:
fasta_file: examples/residue_replacement/1h4x.fasta
residue_replacement:
B:
protein_chain: A
residue_index_to_replace: 57
input: examples/residue_replacement/SEP_ideal_trim.sdf
input_type: "sdf"
N_index_atom: 1
C_index_atom: 5
loader_params:
MAXCYCLE: 10
```
To predict the example, run:
```
python -m rf2aa.run_inference --config-name residue_replacement
```
In this example, we use the phosphoserine structure defined in `SEP_ideal_trim.sdf` and treat it as an atomized residue that replaces residue 57 of chain A. Please note that one extra oxygen atom has to be removed from the carboxylic acid group of the input molecule, as has already been done in `SEP_ideal_trim.sdf`. `N_index_atom` and `C_index_atom` are the 1-based atom numbers in the input file of the atoms that connect to the previous and the next residues, respectively; `residue_index_to_replace` is likewise 1-based.
<a id="outputs"></a>
### Understanding model outputs

View file

@ -0,0 +1,3 @@
>1H4X_1|Chains A, B|ANTI-SIGMA F FACTOR ANTAGONIST|BACILLUS SPHAERICUS (1421)
MAFQLEMVTRETVVIRLFGELDHHAVEQIRAKISTAIFQGAVTTIIWNFERLSFMDSSGVGLVLGRMRELEAVAGRTILLNPSPTMRKVFQFSGLGPWMMDATEEEAIDRVRGIVNG

View file

@ -0,0 +1,37 @@
SEP
PyMOL2.6 3D 0
10 9 0 0 1 0 0 0 0 0999 V2000
1.8550 0.4210 1.7510 N 0 0 0 0 0 0 0 0 0 0 0 0
0.4010 0.6200 1.6870 C 0 0 1 0 0 0 0 0 0 0 0 0
-0.1390 0.0150 0.3910 C 0 0 0 0 0 0 0 0 0 0 0 0
0.4770 0.6550 -0.7270 O 0 0 0 0 0 0 0 0 0 0 0 0
-0.2490 -0.0530 2.8670 C 0 0 0 0 0 0 0 0 0 0 0 0
0.2540 -1.0380 3.3540 O 0 0 0 0 0 0 0 0 0 0 0 0
-0.1350 -0.0270 -2.0500 P 0 0 0 0 0 0 0 0 0 0 0 0
-1.6010 0.1720 -2.0740 O 0 0 0 0 0 0 0 0 0 0 0 0
0.5200 0.6490 -3.3560 O 0 0 0 0 0 0 0 0 0 0 0 0
0.1910 -1.6030 -2.0410 O 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
2 3 1 0 0 0 0
2 5 1 0 0 0 0
3 4 1 0 0 0 0
4 7 1 0 0 0 0
5 6 2 0 0 0 0
7 8 2 0 0 0 0
7 9 1 0 0 0 0
7 10 1 0 0 0 0
M END
> <OPENEYE_ISO_SMILES>
C([C@@H](C(=O)O)N)OP(=O)(O)O
> <OPENEYE_INCHI>
InChI=1S/C3H8NO6P/c4-2(3(5)6)1-10-11(7,8)9/h2H,1,4H2,(H,5,6)(H2,7,8,9)/t2-/m0/s1
> <OPENEYE_INCHIKEY>
BZQFBWGGLXLEPQ-REOHCLBHSA-N
> <FORMULA>
C3H8NO6P
$$$$

View file

@ -0,0 +1,19 @@
# Inference config for the residue-replacement example (job "1h4x").
defaults:
- base
job_name: "1h4x"
protein_inputs:
A:
fasta_file: examples/residue_replacement/1h4x.fasta
# Each entry under residue_replacement replaces one protein residue with the
# molecule read from `input`; the entry key (here "B") is the chain id under
# which that molecule is loaded.
residue_replacement:
B:
# Must name a chain declared under protein_inputs; inference raises a
# ValueError otherwise.
protein_chain: A
# 1-based position in protein_chain (the loader subtracts 1).
residue_index_to_replace: 57
input: examples/residue_replacement/SEP_ideal_trim.sdf
# The loader accepts "sdf", "mol2" or "pdb".
input_type: "sdf"
# 1-based atom numbers (the loader subtracts 1) of the atoms that connect to
# the previous (N_index_atom) and the next (C_index_atom) residue.
N_index_atom: 1
C_index_atom: 5
loader_params:
MAXCYCLE: 10

View file

@ -27,6 +27,31 @@ class AtomizedResidue:
original_chain: str
index_in_original_chain: int
def load_residue_replacement(residue_replacement, model_runner):
    """Load the molecules that stand in for replaced protein residues.

    For every entry of ``residue_replacement`` the molecule file is parsed,
    featurized, and paired with an ``AtomizedResidue`` record describing which
    protein residue it replaces and through which atoms it is connected.

    Args:
        residue_replacement: Mapping from a chain id to a mapping-like config
            entry with the keys ``input`` (path to the molecule file),
            ``input_type`` (``"sdf"``, ``"mol2"`` or ``"pdb"``),
            ``N_index_atom``, ``C_index_atom``, ``protein_chain`` and
            ``residue_index_to_replace``. The three index values are 1-based
            in the config and are converted to 0-based here.
        model_runner: Passed through unchanged to
            ``compute_features_from_obmol``.

    Returns:
        A tuple ``(chainid_to_input, residues_to_atomize)``: a dict mapping
        each chain id to its computed features, and a list with one
        ``AtomizedResidue`` per entry, in iteration order.

    Raises:
        ValueError: If an entry's ``input_type`` is not one of the supported
            file types.
    """
    supported_types = ("sdf", "mol2", "pdb")
    chainid_to_input = {}
    residues_to_atomize = []
    for chain in residue_replacement:
        entry = residue_replacement[chain]
        input_file = entry["input"]
        input_type = entry["input_type"]
        # Raise explicitly instead of asserting: asserts are stripped under
        # `python -O`, which would let an unsupported type reach the parser.
        if input_type not in supported_types:
            raise ValueError("only sdf, mol2 and pdb files are supported")

        # The insertion and mask outputs of parse_mol are not needed here.
        obmol, msa, _ins, xyz, _mask = parse_mol(
            input_file, filetype=input_type, string=False, generate_conformer=True
        )
        chainid_to_input[chain] = compute_features_from_obmol(
            obmol, msa, xyz, model_runner
        )

        # Config indices are 1-based; AtomizedResidue receives 0-based values.
        n_atom_index = int(entry["N_index_atom"]) - 1
        c_atom_index = int(entry["C_index_atom"]) - 1
        replaced_residue_index = int(entry["residue_index_to_replace"]) - 1
        residues_to_atomize.append(AtomizedResidue(
            chain,
            # NOTE(review): presumably the residue index inside the molecule's
            # own chain (always the first) -- confirm against AtomizedResidue.
            0,
            n_atom_index,
            c_atom_index,
            entry["protein_chain"],
            replaced_residue_index,
        ))
    return chainid_to_input, residues_to_atomize
def load_covalent_molecules(protein_inputs, config, model_runner):
if config.covale_inputs is None:

View file

@ -5,7 +5,7 @@ import torch.nn as nn
from dataclasses import asdict
from rf2aa.data.merge_inputs import merge_all
from rf2aa.data.covale import load_covalent_molecules
from rf2aa.data.covale import load_covalent_molecules, load_residue_replacement
from rf2aa.data.nucleic_acid import load_nucleic_acid
from rf2aa.data.protein import generate_msa_and_load_protein
from rf2aa.data.small_molecule import load_small_molecule
@ -86,10 +86,14 @@ class ModelRunner:
sm_inputs[chain] = sm_input
if self.config.residue_replacement is not None:
# add to the sm_inputs list
# add to residues to atomize
raise NotImplementedError("Modres inference is not implemented")
for chain in self.config.residue_replacement:
protein_chain = self.config.residue_replacement[chain].protein_chain
if protein_chain not in protein_inputs:
raise ValueError(f"Protein chain {protein_chain} not found in protein inputs")
sm_inputs, residues_to_atomize_replacement = load_residue_replacement(self.config.residue_replacement, self)
sm_inputs.update(sm_inputs)
residues_to_atomize.extend(residues_to_atomize_replacement)
raw_data = merge_all(protein_inputs, na_inputs, sm_inputs, residues_to_atomize, deterministic=self.deterministic)
self.raw_data = raw_data