add in protein multimer inference

This commit is contained in:
Rohith Krishna 2024-03-05 16:45:27 -08:00
parent f87f5b8cdf
commit 097ad85d4e
11 changed files with 426 additions and 20 deletions

View file

@ -0,0 +1,2 @@
>3FAP_1|Chain A|FK506-BINDING PROTEIN|Homo sapiens (9606)
GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPFKFMLGKQEVIRGWEEGVAQMSVGQRAKLTISPDYAYGATGHPGIIPPHATLVFDVELLKLE

View file

@ -0,0 +1,2 @@
>3FAP_2|Chain B|FKBP12-RAPAMYCIN ASSOCIATED PROTEIN|Homo sapiens (9606)
VAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMERGPQTLKETSFNQAYGRDLMEAQEWCRKYMKSGNVKDLTQAWDLYYHVFRRIS

View file

@ -0,0 +1,322 @@
ARD
-OEChem-02232415173D
150154 0 1 0 0 0 0 0999 V2000
-1.7790 -1.8400 2.4660 O 0 0 0 0 0 0 0 0 0 0 0 0
-0.5750 -1.3280 2.8030 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.1380 -0.4090 2.1630 O 0 0 0 0 0 0 0 0 0 0 0 0
0.1530 -1.9370 3.9570 C 0 0 2 0 0 0 0 0 0 0 0 0
-0.5340 -1.5700 5.2770 C 0 0 0 0 0 0 0 0 0 0 0 0
0.1240 -2.3500 6.4190 C 0 0 0 0 0 0 0 0 0 0 0 0
1.6170 -2.0190 6.4880 C 0 0 0 0 0 0 0 0 0 0 0 0
2.2820 -2.4020 5.1570 C 0 0 0 0 0 0 0 0 0 0 0 0
1.5790 -1.6200 4.1030 N 0 0 0 0 0 0 0 0 0 0 0 0
2.2890 -0.7310 3.4090 C 0 0 0 0 0 0 0 0 0 0 0 0
1.7590 0.2990 3.0480 O 0 0 0 0 0 0 0 0 0 0 0 0
3.7290 -0.8580 3.1350 C 0 0 0 0 0 0 0 0 0 0 0 0
4.1990 -1.9680 3.1760 O 0 0 0 0 0 0 0 0 0 0 0 0
4.6500 0.2750 2.7820 C 0 0 1 0 0 0 0 0 0 0 0 0
6.0730 -0.0990 3.2010 C 0 0 1 0 0 0 0 0 0 0 0 0
7.0540 1.0150 2.8270 C 0 0 0 0 0 0 0 0 0 0 0 0
6.9100 1.2850 1.3170 C 0 0 0 0 0 0 0 0 0 0 0 0
5.4410 1.6400 1.0770 C 0 0 2 0 0 0 0 0 0 0 0 0
4.6020 0.5430 1.3850 O 0 0 0 0 0 0 0 0 0 0 0 0
4.2490 1.4380 3.5170 O 0 0 0 0 0 0 0 0 0 0 0 0
6.1240 -0.3020 4.7200 C 0 0 0 0 0 0 0 0 0 0 0 0
5.2000 2.2840 -0.2620 C 0 0 0 0 0 0 0 0 0 0 0 0
5.7060 1.5450 -1.4960 C 0 0 1 0 0 0 0 0 0 0 0 0
5.5470 0.0580 -1.3360 C 0 0 0 0 0 0 0 0 0 0 0 0
4.8870 2.0220 -2.6780 C 0 0 0 0 0 0 0 0 0 0 0 0
4.6730 3.5180 -2.8110 C 0 0 0 0 0 0 0 0 0 0 0 0
4.3300 1.2230 -3.5690 C 0 0 0 0 0 0 0 0 0 0 0 0
3.4840 1.7930 -4.6240 C 0 0 0 0 0 0 0 0 0 0 0 0
2.7570 0.9680 -5.4040 C 0 0 0 0 0 0 0 0 0 0 0 0
1.8610 1.5580 -6.3980 C 0 0 0 0 0 0 0 0 0 0 0 0
1.0770 0.7650 -7.1230 C 0 0 0 0 0 0 0 0 0 0 0 0
0.1320 1.3510 -8.1390 C 0 0 1 0 0 0 0 0 0 0 0 0
0.7560 1.2520 -9.5340 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.1760 0.5550 -8.1080 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.9730 0.9210 -6.8550 C 0 0 1 0 0 0 0 0 0 0 0 0
-2.7810 2.1950 -7.1240 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.9200 -0.2000 -6.5090 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.9910 -1.1750 -7.2160 O 0 0 0 0 0 0 0 0 0 0 0 0
-3.7750 -0.1010 -5.2690 C 0 0 2 0 0 0 0 0 0 0 0 0
-2.9350 0.0970 -4.1300 O 0 0 0 0 0 0 0 0 0 0 0 0
-2.7650 1.5080 -3.9830 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.5910 -1.3810 -5.0870 C 0 0 2 0 0 0 0 0 0 0 0 0
-5.7300 -1.3510 -5.9480 O 0 0 0 0 0 0 0 0 0 0 0 0
-5.0540 -1.4780 -3.6520 C 0 0 0 0 0 0 0 0 0 0 0 0
-6.1090 -0.5300 -3.1490 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.5170 -2.3830 -2.8800 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.8930 -2.5500 -1.4300 C 0 0 1 0 0 0 0 0 0 0 0 0
-4.8210 -4.0370 -1.0630 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.8900 -1.7740 -0.6090 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.4830 -0.7100 -1.0220 O 0 0 0 0 0 0 0 0 0 0 0 0
-3.3920 -2.3030 0.7040 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.6090 -1.2170 1.4520 C 0 0 1 0 0 0 0 0 0 0 0 0
-3.5900 -0.2550 2.1240 C 0 0 2 0 0 0 0 0 0 0 0 0
-4.4730 0.3980 1.0580 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.8110 0.8260 2.8740 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.7860 1.7040 3.6610 C 0 0 1 0 0 0 0 0 0 0 0 0
-4.4510 0.8710 4.7590 C 0 0 0 0 0 0 0 0 0 0 0 0
-5.4260 1.7490 5.5460 C 0 0 1 0 0 0 0 0 0 0 0 0
-6.0460 0.9710 6.5720 O 0 0 0 0 0 0 0 0 0 0 0 0
-7.3460 1.5260 6.7810 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.6640 2.9150 6.1800 C 0 0 2 0 0 0 0 0 0 0 0 0
-5.5750 3.7350 6.9150 O 0 0 0 0 0 0 0 0 0 0 0 0
-4.0000 3.7480 5.0820 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.0250 2.8700 4.2950 C 0 0 0 0 0 0 0 0 0 0 0 0
4.8030 -2.5110 -0.9950 C 0 0 0 0 0 0 0 0 0 0 0 0
6.1120 -2.2280 -1.2040 C 0 0 0 0 0 0 0 0 0 0 0 0
6.4930 -0.9110 -1.3840 C 0 0 0 0 0 0 0 0 0 0 0 0
4.1450 -3.8500 -0.7790 C 0 0 0 0 0 0 0 0 0 0 0 0
4.0620 -0.8880 -1.0410 S 0 0 0 0 0 0 0 0 0 0 0 0
0.0770 -3.0380 3.8490 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.5890 -1.8360 5.2140 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.4290 -0.5030 5.4520 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.0020 -3.4180 6.2480 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.3530 -2.0760 7.3610 H 0 0 0 0 0 0 0 0 0 0 0 0
2.0800 -2.5780 7.3010 H 0 0 0 0 0 0 0 0 0 0 0 0
1.7450 -0.9500 6.6630 H 0 0 0 0 0 0 0 0 0 0 0 0
2.1540 -3.4630 4.9650 H 0 0 0 0 0 0 0 0 0 0 0 0
3.3350 -2.1320 5.1760 H 0 0 0 0 0 0 0 0 0 0 0 0
6.3710 -1.0230 2.7070 H 0 0 0 0 0 0 0 0 0 0 0 0
8.0740 0.6960 3.0480 H 0 0 0 0 0 0 0 0 0 0 0 0
6.8170 1.9170 3.3880 H 0 0 0 0 0 0 0 0 0 0 0 0
7.1900 0.3900 0.7630 H 0 0 0 0 0 0 0 0 0 0 0 0
7.5510 2.1180 1.0280 H 0 0 0 0 0 0 0 0 0 0 0 0
5.2120 2.4360 1.8310 H 0 0 0 0 0 0 0 0 0 0 0 0
3.3500 1.6510 3.2330 H 0 0 0 0 0 0 0 0 0 0 0 0
7.1360 -0.5820 5.0150 H 0 0 0 0 0 0 0 0 0 0 0 0
5.8430 0.6230 5.2200 H 0 0 0 0 0 0 0 0 0 0 0 0
5.4310 -1.0940 5.0040 H 0 0 0 0 0 0 0 0 0 0 0 0
5.6660 3.2750 -0.2390 H 0 0 0 0 0 0 0 0 0 0 0 0
4.1180 2.4310 -0.3660 H 0 0 0 0 0 0 0 0 0 0 0 0
6.7560 1.7870 -1.6720 H 0 0 0 0 0 0 0 0 0 0 0 0
5.1280 3.8690 -3.7370 H 0 0 0 0 0 0 0 0 0 0 0 0
3.6040 3.7320 -2.8280 H 0 0 0 0 0 0 0 0 0 0 0 0
5.1320 4.0280 -1.9650 H 0 0 0 0 0 0 0 0 0 0 0 0
4.4740 0.1510 -3.5930 H 0 0 0 0 0 0 0 0 0 0 0 0
3.4240 2.8510 -4.8190 H 0 0 0 0 0 0 0 0 0 0 0 0
2.8420 -0.0990 -5.2940 H 0 0 0 0 0 0 0 0 0 0 0 0
1.8440 2.6280 -6.5350 H 0 0 0 0 0 0 0 0 0 0 0 0
1.1250 -0.3040 -6.9730 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.0570 2.3960 -7.9020 H 0 0 0 0 0 0 0 0 0 0 0 0
1.6870 1.8180 -9.5570 H 0 0 0 0 0 0 0 0 0 0 0 0
0.9600 0.2070 -9.7670 H 0 0 0 0 0 0 0 0 0 0 0 0
0.0650 1.6600 -10.2710 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.9490 -0.5100 -8.0980 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.7640 0.7950 -8.9950 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.2920 1.0940 -6.0220 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.3510 2.4590 -6.2340 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.1010 3.0090 -7.3750 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.4640 2.0230 -7.9560 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.4560 0.7450 -5.3670 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.1280 1.7100 -3.1220 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.3010 1.9140 -4.8820 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.7380 1.9770 -3.8330 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.9720 -2.2470 -5.3190 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.2150 -2.1740 -5.7980 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.3130 -0.7370 -2.0990 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.7550 0.4950 -3.2540 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.0220 -0.6610 -3.7300 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.7640 -3.0400 -3.3040 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.8960 -2.1880 -1.2300 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.5460 -4.5940 -1.6570 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.8190 -4.4140 -1.2660 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.0480 -4.1610 -0.0040 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.7370 -3.1570 0.5320 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.2360 -2.6120 1.3210 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.9770 -0.6780 0.7470 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.2160 -0.8050 2.8260 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.8430 0.8690 0.3030 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.0980 -0.3600 0.5880 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.1060 1.1530 1.5250 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.2630 1.4410 2.1600 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.1090 0.3560 3.5630 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.5500 2.0930 2.9870 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.6870 0.4830 5.4330 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.9930 0.0410 4.3070 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.1890 2.1380 4.8720 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.8610 0.9620 7.5590 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.2520 2.5670 7.0890 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.9180 1.4710 5.8550 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.9010 2.5270 6.8540 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.0560 4.4560 7.2970 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.4580 4.5790 5.5340 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.7640 4.1360 4.4080 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.2610 2.4820 4.9690 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.5520 3.4630 3.5130 H 0 0 0 0 0 0 0 0 0 0 0 0
6.8470 -3.0190 -1.2330 H 0 0 0 0 0 0 0 0 0 0 0 0
7.5290 -0.6580 -1.5530 H 0 0 0 0 0 0 0 0 0 0 0 0
3.0730 -3.7090 -0.6420 H 0 0 0 0 0 0 0 0 0 0 0 0
4.3200 -4.4850 -1.6480 H 0 0 0 0 0 0 0 0 0 0 0 0
4.5660 -4.3230 0.1070 H 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
1 52 1 0 0 0 0
2 3 2 0 0 0 0
2 4 1 0 0 0 0
4 5 1 0 0 0 0
4 9 1 0 0 0 0
4 70 1 0 0 0 0
5 6 1 0 0 0 0
5 71 1 0 0 0 0
5 72 1 0 0 0 0
6 7 1 0 0 0 0
6 73 1 0 0 0 0
6 74 1 0 0 0 0
7 8 1 0 0 0 0
7 75 1 0 0 0 0
7 76 1 0 0 0 0
8 9 1 0 0 0 0
8 77 1 0 0 0 0
8 78 1 0 0 0 0
9 10 1 0 0 0 0
10 11 2 0 0 0 0
10 12 1 0 0 0 0
12 13 2 0 0 0 0
12 14 1 0 0 0 0
14 15 1 0 0 0 0
14 19 1 0 0 0 0
14 20 1 0 0 0 0
15 16 1 0 0 0 0
15 21 1 0 0 0 0
15 79 1 0 0 0 0
16 17 1 0 0 0 0
16 80 1 0 0 0 0
16 81 1 0 0 0 0
17 18 1 0 0 0 0
17 82 1 0 0 0 0
17 83 1 0 0 0 0
18 19 1 0 0 0 0
18 22 1 0 0 0 0
18 84 1 0 0 0 0
20 85 1 0 0 0 0
21 86 1 0 0 0 0
21 87 1 0 0 0 0
21 88 1 0 0 0 0
22 23 1 0 0 0 0
22 89 1 0 0 0 0
22 90 1 0 0 0 0
23 24 1 0 0 0 0
23 25 1 0 0 0 0
23 91 1 0 0 0 0
24 67 2 0 0 0 0
24 69 1 0 0 0 0
25 26 1 0 0 0 0
25 27 2 0 0 0 0
26 92 1 0 0 0 0
26 93 1 0 0 0 0
26 94 1 0 0 0 0
27 28 1 0 0 0 0
27 95 1 0 0 0 0
28 29 2 0 0 0 0
28 96 1 0 0 0 0
29 30 1 0 0 0 0
29 97 1 0 0 0 0
30 31 2 0 0 0 0
30 98 1 0 0 0 0
31 32 1 0 0 0 0
31 99 1 0 0 0 0
32 33 1 0 0 0 0
32 34 1 0 0 0 0
32100 1 0 0 0 0
33101 1 0 0 0 0
33102 1 0 0 0 0
33103 1 0 0 0 0
34 35 1 0 0 0 0
34104 1 0 0 0 0
34105 1 0 0 0 0
35 36 1 0 0 0 0
35 37 1 0 0 0 0
35106 1 0 0 0 0
36107 1 0 0 0 0
36108 1 0 0 0 0
36109 1 0 0 0 0
37 38 2 0 0 0 0
37 39 1 0 0 0 0
39 40 1 0 0 0 0
39 42 1 0 0 0 0
39110 1 0 0 0 0
40 41 1 0 0 0 0
41111 1 0 0 0 0
41112 1 0 0 0 0
41113 1 0 0 0 0
42 43 1 0 0 0 0
42 44 1 0 0 0 0
42114 1 0 0 0 0
43115 1 0 0 0 0
44 45 1 0 0 0 0
44 46 2 0 0 0 0
45116 1 0 0 0 0
45117 1 0 0 0 0
45118 1 0 0 0 0
46 47 1 0 0 0 0
46119 1 0 0 0 0
47 48 1 0 0 0 0
47 49 1 0 0 0 0
47120 1 0 0 0 0
48121 1 0 0 0 0
48122 1 0 0 0 0
48123 1 0 0 0 0
49 50 2 0 0 0 0
49 51 1 0 0 0 0
51 52 1 0 0 0 0
51124 1 0 0 0 0
51125 1 0 0 0 0
52 53 1 0 0 0 0
52126 1 0 0 0 0
53 54 1 0 0 0 0
53 55 1 0 0 0 0
53127 1 0 0 0 0
54128 1 0 0 0 0
54129 1 0 0 0 0
54130 1 0 0 0 0
55 56 1 0 0 0 0
55131 1 0 0 0 0
55132 1 0 0 0 0
56 57 1 0 0 0 0
56 64 1 0 0 0 0
56133 1 0 0 0 0
57 58 1 0 0 0 0
57134 1 0 0 0 0
57135 1 0 0 0 0
58 59 1 0 0 0 0
58 61 1 0 0 0 0
58136 1 0 0 0 0
59 60 1 0 0 0 0
60137 1 0 0 0 0
60138 1 0 0 0 0
60139 1 0 0 0 0
61 62 1 0 0 0 0
61 63 1 0 0 0 0
61140 1 0 0 0 0
62141 1 0 0 0 0
63 64 1 0 0 0 0
63142 1 0 0 0 0
63143 1 0 0 0 0
64144 1 0 0 0 0
64145 1 0 0 0 0
65 66 2 0 0 0 0
65 68 1 0 0 0 0
65 69 1 0 0 0 0
66 67 1 0 0 0 0
66146 1 0 0 0 0
67147 1 0 0 0 0
68148 1 0 0 0 0
68149 1 0 0 0 0
68150 1 0 0 0 0
M END
> <OPENEYE_ISO_SMILES>
Cc1ccc(s1)[C@@H]\2C[C@@H]3CC[C@H]([C@@](O3)(C(=O)C(=O)N4CCCC[C@H]4C(=O)O[C@@H](CC(=O)[C@@H](/C=C(/[C@H]([C@H](C(=O)[C@@H](C[C@@H](/C=C/C=C/C=C2\C)C)C)OC)O)\C)C)[C@H](C)C[C@@H]5CC[C@H]([C@@H](C5)OC)O)O)C
> <OPENEYE_INCHI>
InChI=1S/C55H81NO12S/c1-32-16-12-11-13-17-33(2)42(48-24-20-39(8)69-48)30-41-22-19-38(7)55(64,68-41)52(61)53(62)56-25-15-14-18-43(56)54(63)67-46(35(4)28-40-21-23-44(57)47(29-40)65-9)31-45(58)34(3)27-37(6)50(60)51(66-10)49(59)36(5)26-32/h11-13,16-17,20,24,27,32,34-36,38,40-44,46-47,50-51,57,60,64H,14-15,18-19,21-23,25-26,28-31H2,1-10H3/b13-11+,16-12+,33-17+,37-27+/t32-,34-,35-,36-,38-,40+,41+,42-,43+,44-,46+,47-,50-,51+,55-/m1/s1
> <OPENEYE_INCHIKEY>
SDSGJAIFUCCAOV-MSLSVLDMSA-N
> <FORMULA>
C55H81NO12S
$$$$

View file

@ -0,0 +1,14 @@
# Inference job for the 3FAP ternary complex: two protein chains plus one
# small-molecule ligand. Chain letters are the keys under each *_inputs map.
defaults:
  - base

job_name: "3fap"
protein_inputs:
  A:
    fasta_file: examples/protein/3fap_A.fasta
  B:
    fasta_file: examples/protein/3fap_B.fasta
sm_inputs:
  C:
    input: examples/small_molecule/ARD_ideal.sdf
    input_type: "sdf"

View file

@ -28,6 +28,10 @@ class RawInputData:
def query_sequence(self): def query_sequence(self):
return self.msa[0] return self.msa[0]
def sequence_string(self):
    """Return the query sequence as a single string of residue codes.

    Each entry of ``self.query_sequence()`` is looked up in
    ``ChemData().num2aa`` and the result is then mapped through
    ``ChemData().aa_321``; the mapped values are concatenated in order.
    (Presumably index -> three-letter name -> one-letter code; confirm
    against ChemData.)

    Raises:
        KeyError/IndexError: if a token has no entry in either table, e.g.
            a non-protein token. NOTE(review): callers appear to use this
            only on protein chains -- confirm.
    """
    # Build the lookup tables once instead of once per residue.
    chem_data = ChemData()
    num2aa = chem_data.num2aa
    aa_321 = chem_data.aa_321
    return "".join(aa_321[num2aa[num]] for num in self.query_sequence())
def is_atom(self): def is_atom(self):
return is_atom(self.query_sequence()) return is_atom(self.query_sequence())

View file

@ -744,7 +744,7 @@ def load_minimal_multi_msa(hash_list, taxid_list, Ls, params):
return a3m_out, hashes_out, Ls_out return a3m_out, hashes_out, Ls_out
def expand_multi_msa(a3m, hashes_in, hashes_out, Ls_in, Ls_out, params): def expand_multi_msa(a3m, hashes_in, hashes_out, Ls_in, Ls_out):
"""Expands a multi-MSA of unique chains into an MSA of a """Expands a multi-MSA of unique chains into an MSA of a
hetero-homo-oligomer in which some chains appear more than once. The query hetero-homo-oligomer in which some chains appear more than once. The query
sequences (1st sequence of MSA) are concatenated directly along the sequences (1st sequence of MSA) are concatenated directly along the

View file

@ -1,6 +1,7 @@
import torch import torch
from hashlib import md5
from rf2aa.data.data_loader_utils import merge_a3m_hetero, merge_a3m_homo, merge_hetero_templates, get_term_feats from rf2aa.data.data_loader_utils import merge_a3m_hetero, merge_a3m_homo, merge_hetero_templates, get_term_feats, join_msas_by_taxid, expand_multi_msa
from rf2aa.data.data_loader import RawInputData from rf2aa.data.data_loader import RawInputData
from rf2aa.util import center_and_realign_missing, same_chain_from_bond_feats, random_rot_trans, idx_from_Ls from rf2aa.util import center_and_realign_missing, same_chain_from_bond_feats, random_rot_trans, idx_from_Ls
@ -18,7 +19,71 @@ def merge_protein_inputs(protein_inputs, deterministic: bool = False):
# handle merging MSAs and such # handle merging MSAs and such
# first determine which sequence are identical, then which one have mergeable MSAs # first determine which sequence are identical, then which one have mergeable MSAs
# then cat the templates, other feats # then cat the templates, other feats
pass else:
a3m_list = [
{"msa": input.msa,
"ins": input.ins,
"taxid": input.taxids
}
for input in protein_inputs.values()
]
hash_list = [md5(input.sequence_string().encode()).hexdigest() for input in protein_inputs.values()]
lengths_list = [input.length() for input in protein_inputs.values()]
seen = set()
unique_indices = []
for idx, hash in enumerate(hash_list):
if hash not in seen:
unique_indices.append(idx)
seen.add(hash)
unique_a3m = [a3m for i, a3m in enumerate(a3m_list) if i in unique_indices ]
unique_hashes = [value for index, value in enumerate(hash_list) if index in unique_indices]
unique_lengths_list = [value for index, value in enumerate(lengths_list) if index in unique_indices]
if len(unique_a3m) >1:
a3m_out = unique_a3m[0]
for i in range(1, len(unique_a3m)):
a3m_out = join_msas_by_taxid(a3m_out, a3m_list[i])
a3m_out = expand_multi_msa(a3m_out, unique_hashes, hash_list, unique_lengths_list, lengths_list)
else:
a3m = unique_a3m[0]
msa, ins = a3m["msa"], a3m["ins"]
a3m_out = merge_a3m_homo(msa, ins, len(hash_list))
# merge templates
max_template_dim = max([input.xyz_t.shape[0] for input in protein_inputs.values()])
xyz_t_list = [input.xyz_t for input in protein_inputs.values()]
mask_t_list = [input.mask_t for input in protein_inputs.values()]
t1d_list = [input.t1d for input in protein_inputs.values()]
ids = ["inference"] * len(t1d_list)
xyz_t, t1d, mask_t, _ = merge_hetero_templates(xyz_t_list, t1d_list, mask_t_list, ids, lengths_list, deterministic=deterministic)
atom_frames = torch.zeros(0,3,2)
chirals = torch.zeros(0,5)
L_total = sum(lengths_list)
bond_feats = torch.zeros((L_total, L_total)).long()
offset = 0
for bf in [input.bond_feats for input in protein_inputs.values()]:
L = bf.shape[0]
bond_feats[offset:offset+L, offset:offset+L] = bf
offset += L
chain_lengths = list(zip(protein_inputs.keys(), lengths_list))
merged_input = RawInputData(
a3m_out["msa"],
a3m_out["ins"],
bond_feats,
xyz_t[:max_template_dim],
mask_t[:max_template_dim],
t1d[:max_template_dim],
chirals,
atom_frames,
taxids=None
)
return merged_input, chain_lengths
def merge_na_inputs(na_inputs): def merge_na_inputs(na_inputs):
# should just be trivially catting features # should just be trivially catting features
@ -101,14 +166,6 @@ def merge_all(
deterministic: bool = False, deterministic: bool = False,
): ):
#protein_lengths = [protein_input.length() for protein_input in protein_inputs.values()]
#na_lengths = [na_input.length() for na_input in na_inputs.values()]
#sm_lengths = [sm_input.length() for sm_input in sm_inputs.values()]
#all_lengths = protein_lengths + na_lengths + sm_lengths
#term_info = get_term_feats(all_lengths)
#term_info[sum(protein_lengths):, :] = 0
protein_inputs, protein_chain_lengths = merge_protein_inputs(protein_inputs, deterministic=deterministic) protein_inputs, protein_chain_lengths = merge_protein_inputs(protein_inputs, deterministic=deterministic)
na_inputs, na_chain_lengths = merge_na_inputs(na_inputs) na_inputs, na_chain_lengths = merge_na_inputs(na_inputs)

View file

@ -414,18 +414,21 @@ def parse_a3m(filename, maxseq=8000, paired=False):
else: else:
fstream = open(filename, 'r') fstream = open(filename, 'r')
for line in fstream: for i, line in enumerate(fstream):
# skip labels # skip labels
if line[0] == '>': if line[0] == '>':
if paired: # paired MSAs only have a TAXID in the fasta header if paired: # paired MSAs only have a TAXID in the fasta header
taxIDs.append(line[1:].strip()) taxIDs.append(line[1:].strip())
else: # unpaired MSAs have all the metadata so use regex to pull out TAXID else: # unpaired MSAs have all the metadata so use regex to pull out TAXID
if i == 0:
taxIDs.append("query")
else:
match = re.search( r'TaxID=(\d+)', line) match = re.search( r'TaxID=(\d+)', line)
if match: if match:
taxIDs.append(match.group(1)) taxIDs.append(match.group(1))
else: else:
taxIDs.append("query") # query sequence taxIDs.append("") # query sequence
continue continue
# remove right whitespaces # remove right whitespaces

View file

@ -8,11 +8,12 @@ import subprocess
def make_msa( def make_msa(
fasta_file, fasta_file,
chain,
model_runner model_runner
): ):
out_dir_base = Path(model_runner.config.output_path) out_dir_base = Path(model_runner.config.output_path)
hash = model_runner.config.job_name hash = model_runner.config.job_name
out_dir = out_dir_base / hash out_dir = out_dir_base / hash / chain
out_dir.mkdir(parents=True, exist_ok=True) out_dir.mkdir(parents=True, exist_ok=True)
command = model_runner.config.database_params.command command = model_runner.config.database_params.command

View file

@ -88,6 +88,6 @@ def load_protein(msa_file, hhr_fn, atab_fn, model_runner):
taxids=taxIDs, taxids=taxIDs,
) )
def generate_msa_and_load_protein(fasta_file, chain, model_runner):
    """Build the MSA/template files for one chain and load them as a protein input.

    Args:
        fasta_file: path to the chain's FASTA file, forwarded to ``make_msa``.
        chain: chain identifier, forwarded to ``make_msa``.
        model_runner: runner object, forwarded to ``make_msa`` and ``load_protein``.

    Returns:
        Whatever ``load_protein`` returns for the three generated files.
    """
    generated_files = make_msa(fasta_file, chain, model_runner)
    msa_path, hhr_path, atab_path = generated_files
    # load_protein is given plain strings rather than the objects make_msa returns.
    return load_protein(
        str(msa_path),
        str(hhr_path),
        str(atab_path),
        model_runner,
    )

View file

@ -45,6 +45,7 @@ class ModelRunner:
chains.append(chain) chains.append(chain)
protein_input = generate_msa_and_load_protein( protein_input = generate_msa_and_load_protein(
self.config.protein_inputs[chain]["fasta_file"], self.config.protein_inputs[chain]["fasta_file"],
chain,
self self
) )
protein_inputs[chain] = protein_input protein_inputs[chain] = protein_input