add in protein multimer inference

This commit is contained in:
Rohith Krishna 2024-03-05 16:45:27 -08:00
parent f87f5b8cdf
commit 097ad85d4e
11 changed files with 426 additions and 20 deletions

View file

@ -0,0 +1,2 @@
>3FAP_1|Chain A|FK506-BINDING PROTEIN|Homo sapiens (9606)
GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPFKFMLGKQEVIRGWEEGVAQMSVGQRAKLTISPDYAYGATGHPGIIPPHATLVFDVELLKLE

View file

@ -0,0 +1,2 @@
>3FAP_2|Chain B|FKBP12-RAPAMYCIN ASSOCIATED PROTEIN|Homo sapiens (9606)
VAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMERGPQTLKETSFNQAYGRDLMEAQEWCRKYMKSGNVKDLTQAWDLYYHVFRRIS

View file

@ -0,0 +1,322 @@
ARD
-OEChem-02232415173D
150154 0 1 0 0 0 0 0999 V2000
-1.7790 -1.8400 2.4660 O 0 0 0 0 0 0 0 0 0 0 0 0
-0.5750 -1.3280 2.8030 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.1380 -0.4090 2.1630 O 0 0 0 0 0 0 0 0 0 0 0 0
0.1530 -1.9370 3.9570 C 0 0 2 0 0 0 0 0 0 0 0 0
-0.5340 -1.5700 5.2770 C 0 0 0 0 0 0 0 0 0 0 0 0
0.1240 -2.3500 6.4190 C 0 0 0 0 0 0 0 0 0 0 0 0
1.6170 -2.0190 6.4880 C 0 0 0 0 0 0 0 0 0 0 0 0
2.2820 -2.4020 5.1570 C 0 0 0 0 0 0 0 0 0 0 0 0
1.5790 -1.6200 4.1030 N 0 0 0 0 0 0 0 0 0 0 0 0
2.2890 -0.7310 3.4090 C 0 0 0 0 0 0 0 0 0 0 0 0
1.7590 0.2990 3.0480 O 0 0 0 0 0 0 0 0 0 0 0 0
3.7290 -0.8580 3.1350 C 0 0 0 0 0 0 0 0 0 0 0 0
4.1990 -1.9680 3.1760 O 0 0 0 0 0 0 0 0 0 0 0 0
4.6500 0.2750 2.7820 C 0 0 1 0 0 0 0 0 0 0 0 0
6.0730 -0.0990 3.2010 C 0 0 1 0 0 0 0 0 0 0 0 0
7.0540 1.0150 2.8270 C 0 0 0 0 0 0 0 0 0 0 0 0
6.9100 1.2850 1.3170 C 0 0 0 0 0 0 0 0 0 0 0 0
5.4410 1.6400 1.0770 C 0 0 2 0 0 0 0 0 0 0 0 0
4.6020 0.5430 1.3850 O 0 0 0 0 0 0 0 0 0 0 0 0
4.2490 1.4380 3.5170 O 0 0 0 0 0 0 0 0 0 0 0 0
6.1240 -0.3020 4.7200 C 0 0 0 0 0 0 0 0 0 0 0 0
5.2000 2.2840 -0.2620 C 0 0 0 0 0 0 0 0 0 0 0 0
5.7060 1.5450 -1.4960 C 0 0 1 0 0 0 0 0 0 0 0 0
5.5470 0.0580 -1.3360 C 0 0 0 0 0 0 0 0 0 0 0 0
4.8870 2.0220 -2.6780 C 0 0 0 0 0 0 0 0 0 0 0 0
4.6730 3.5180 -2.8110 C 0 0 0 0 0 0 0 0 0 0 0 0
4.3300 1.2230 -3.5690 C 0 0 0 0 0 0 0 0 0 0 0 0
3.4840 1.7930 -4.6240 C 0 0 0 0 0 0 0 0 0 0 0 0
2.7570 0.9680 -5.4040 C 0 0 0 0 0 0 0 0 0 0 0 0
1.8610 1.5580 -6.3980 C 0 0 0 0 0 0 0 0 0 0 0 0
1.0770 0.7650 -7.1230 C 0 0 0 0 0 0 0 0 0 0 0 0
0.1320 1.3510 -8.1390 C 0 0 1 0 0 0 0 0 0 0 0 0
0.7560 1.2520 -9.5340 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.1760 0.5550 -8.1080 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.9730 0.9210 -6.8550 C 0 0 1 0 0 0 0 0 0 0 0 0
-2.7810 2.1950 -7.1240 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.9200 -0.2000 -6.5090 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.9910 -1.1750 -7.2160 O 0 0 0 0 0 0 0 0 0 0 0 0
-3.7750 -0.1010 -5.2690 C 0 0 2 0 0 0 0 0 0 0 0 0
-2.9350 0.0970 -4.1300 O 0 0 0 0 0 0 0 0 0 0 0 0
-2.7650 1.5080 -3.9830 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.5910 -1.3810 -5.0870 C 0 0 2 0 0 0 0 0 0 0 0 0
-5.7300 -1.3510 -5.9480 O 0 0 0 0 0 0 0 0 0 0 0 0
-5.0540 -1.4780 -3.6520 C 0 0 0 0 0 0 0 0 0 0 0 0
-6.1090 -0.5300 -3.1490 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.5170 -2.3830 -2.8800 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.8930 -2.5500 -1.4300 C 0 0 1 0 0 0 0 0 0 0 0 0
-4.8210 -4.0370 -1.0630 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.8900 -1.7740 -0.6090 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.4830 -0.7100 -1.0220 O 0 0 0 0 0 0 0 0 0 0 0 0
-3.3920 -2.3030 0.7040 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.6090 -1.2170 1.4520 C 0 0 1 0 0 0 0 0 0 0 0 0
-3.5900 -0.2550 2.1240 C 0 0 2 0 0 0 0 0 0 0 0 0
-4.4730 0.3980 1.0580 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.8110 0.8260 2.8740 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.7860 1.7040 3.6610 C 0 0 1 0 0 0 0 0 0 0 0 0
-4.4510 0.8710 4.7590 C 0 0 0 0 0 0 0 0 0 0 0 0
-5.4260 1.7490 5.5460 C 0 0 1 0 0 0 0 0 0 0 0 0
-6.0460 0.9710 6.5720 O 0 0 0 0 0 0 0 0 0 0 0 0
-7.3460 1.5260 6.7810 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.6640 2.9150 6.1800 C 0 0 2 0 0 0 0 0 0 0 0 0
-5.5750 3.7350 6.9150 O 0 0 0 0 0 0 0 0 0 0 0 0
-4.0000 3.7480 5.0820 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.0250 2.8700 4.2950 C 0 0 0 0 0 0 0 0 0 0 0 0
4.8030 -2.5110 -0.9950 C 0 0 0 0 0 0 0 0 0 0 0 0
6.1120 -2.2280 -1.2040 C 0 0 0 0 0 0 0 0 0 0 0 0
6.4930 -0.9110 -1.3840 C 0 0 0 0 0 0 0 0 0 0 0 0
4.1450 -3.8500 -0.7790 C 0 0 0 0 0 0 0 0 0 0 0 0
4.0620 -0.8880 -1.0410 S 0 0 0 0 0 0 0 0 0 0 0 0
0.0770 -3.0380 3.8490 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.5890 -1.8360 5.2140 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.4290 -0.5030 5.4520 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.0020 -3.4180 6.2480 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.3530 -2.0760 7.3610 H 0 0 0 0 0 0 0 0 0 0 0 0
2.0800 -2.5780 7.3010 H 0 0 0 0 0 0 0 0 0 0 0 0
1.7450 -0.9500 6.6630 H 0 0 0 0 0 0 0 0 0 0 0 0
2.1540 -3.4630 4.9650 H 0 0 0 0 0 0 0 0 0 0 0 0
3.3350 -2.1320 5.1760 H 0 0 0 0 0 0 0 0 0 0 0 0
6.3710 -1.0230 2.7070 H 0 0 0 0 0 0 0 0 0 0 0 0
8.0740 0.6960 3.0480 H 0 0 0 0 0 0 0 0 0 0 0 0
6.8170 1.9170 3.3880 H 0 0 0 0 0 0 0 0 0 0 0 0
7.1900 0.3900 0.7630 H 0 0 0 0 0 0 0 0 0 0 0 0
7.5510 2.1180 1.0280 H 0 0 0 0 0 0 0 0 0 0 0 0
5.2120 2.4360 1.8310 H 0 0 0 0 0 0 0 0 0 0 0 0
3.3500 1.6510 3.2330 H 0 0 0 0 0 0 0 0 0 0 0 0
7.1360 -0.5820 5.0150 H 0 0 0 0 0 0 0 0 0 0 0 0
5.8430 0.6230 5.2200 H 0 0 0 0 0 0 0 0 0 0 0 0
5.4310 -1.0940 5.0040 H 0 0 0 0 0 0 0 0 0 0 0 0
5.6660 3.2750 -0.2390 H 0 0 0 0 0 0 0 0 0 0 0 0
4.1180 2.4310 -0.3660 H 0 0 0 0 0 0 0 0 0 0 0 0
6.7560 1.7870 -1.6720 H 0 0 0 0 0 0 0 0 0 0 0 0
5.1280 3.8690 -3.7370 H 0 0 0 0 0 0 0 0 0 0 0 0
3.6040 3.7320 -2.8280 H 0 0 0 0 0 0 0 0 0 0 0 0
5.1320 4.0280 -1.9650 H 0 0 0 0 0 0 0 0 0 0 0 0
4.4740 0.1510 -3.5930 H 0 0 0 0 0 0 0 0 0 0 0 0
3.4240 2.8510 -4.8190 H 0 0 0 0 0 0 0 0 0 0 0 0
2.8420 -0.0990 -5.2940 H 0 0 0 0 0 0 0 0 0 0 0 0
1.8440 2.6280 -6.5350 H 0 0 0 0 0 0 0 0 0 0 0 0
1.1250 -0.3040 -6.9730 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.0570 2.3960 -7.9020 H 0 0 0 0 0 0 0 0 0 0 0 0
1.6870 1.8180 -9.5570 H 0 0 0 0 0 0 0 0 0 0 0 0
0.9600 0.2070 -9.7670 H 0 0 0 0 0 0 0 0 0 0 0 0
0.0650 1.6600 -10.2710 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.9490 -0.5100 -8.0980 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.7640 0.7950 -8.9950 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.2920 1.0940 -6.0220 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.3510 2.4590 -6.2340 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.1010 3.0090 -7.3750 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.4640 2.0230 -7.9560 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.4560 0.7450 -5.3670 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.1280 1.7100 -3.1220 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.3010 1.9140 -4.8820 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.7380 1.9770 -3.8330 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.9720 -2.2470 -5.3190 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.2150 -2.1740 -5.7980 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.3130 -0.7370 -2.0990 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.7550 0.4950 -3.2540 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.0220 -0.6610 -3.7300 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.7640 -3.0400 -3.3040 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.8960 -2.1880 -1.2300 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.5460 -4.5940 -1.6570 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.8190 -4.4140 -1.2660 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.0480 -4.1610 -0.0040 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.7370 -3.1570 0.5320 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.2360 -2.6120 1.3210 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.9770 -0.6780 0.7470 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.2160 -0.8050 2.8260 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.8430 0.8690 0.3030 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.0980 -0.3600 0.5880 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.1060 1.1530 1.5250 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.2630 1.4410 2.1600 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.1090 0.3560 3.5630 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.5500 2.0930 2.9870 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.6870 0.4830 5.4330 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.9930 0.0410 4.3070 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.1890 2.1380 4.8720 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.8610 0.9620 7.5590 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.2520 2.5670 7.0890 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.9180 1.4710 5.8550 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.9010 2.5270 6.8540 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.0560 4.4560 7.2970 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.4580 4.5790 5.5340 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.7640 4.1360 4.4080 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.2610 2.4820 4.9690 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.5520 3.4630 3.5130 H 0 0 0 0 0 0 0 0 0 0 0 0
6.8470 -3.0190 -1.2330 H 0 0 0 0 0 0 0 0 0 0 0 0
7.5290 -0.6580 -1.5530 H 0 0 0 0 0 0 0 0 0 0 0 0
3.0730 -3.7090 -0.6420 H 0 0 0 0 0 0 0 0 0 0 0 0
4.3200 -4.4850 -1.6480 H 0 0 0 0 0 0 0 0 0 0 0 0
4.5660 -4.3230 0.1070 H 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
1 52 1 0 0 0 0
2 3 2 0 0 0 0
2 4 1 0 0 0 0
4 5 1 0 0 0 0
4 9 1 0 0 0 0
4 70 1 0 0 0 0
5 6 1 0 0 0 0
5 71 1 0 0 0 0
5 72 1 0 0 0 0
6 7 1 0 0 0 0
6 73 1 0 0 0 0
6 74 1 0 0 0 0
7 8 1 0 0 0 0
7 75 1 0 0 0 0
7 76 1 0 0 0 0
8 9 1 0 0 0 0
8 77 1 0 0 0 0
8 78 1 0 0 0 0
9 10 1 0 0 0 0
10 11 2 0 0 0 0
10 12 1 0 0 0 0
12 13 2 0 0 0 0
12 14 1 0 0 0 0
14 15 1 0 0 0 0
14 19 1 0 0 0 0
14 20 1 0 0 0 0
15 16 1 0 0 0 0
15 21 1 0 0 0 0
15 79 1 0 0 0 0
16 17 1 0 0 0 0
16 80 1 0 0 0 0
16 81 1 0 0 0 0
17 18 1 0 0 0 0
17 82 1 0 0 0 0
17 83 1 0 0 0 0
18 19 1 0 0 0 0
18 22 1 0 0 0 0
18 84 1 0 0 0 0
20 85 1 0 0 0 0
21 86 1 0 0 0 0
21 87 1 0 0 0 0
21 88 1 0 0 0 0
22 23 1 0 0 0 0
22 89 1 0 0 0 0
22 90 1 0 0 0 0
23 24 1 0 0 0 0
23 25 1 0 0 0 0
23 91 1 0 0 0 0
24 67 2 0 0 0 0
24 69 1 0 0 0 0
25 26 1 0 0 0 0
25 27 2 0 0 0 0
26 92 1 0 0 0 0
26 93 1 0 0 0 0
26 94 1 0 0 0 0
27 28 1 0 0 0 0
27 95 1 0 0 0 0
28 29 2 0 0 0 0
28 96 1 0 0 0 0
29 30 1 0 0 0 0
29 97 1 0 0 0 0
30 31 2 0 0 0 0
30 98 1 0 0 0 0
31 32 1 0 0 0 0
31 99 1 0 0 0 0
32 33 1 0 0 0 0
32 34 1 0 0 0 0
32100 1 0 0 0 0
33101 1 0 0 0 0
33102 1 0 0 0 0
33103 1 0 0 0 0
34 35 1 0 0 0 0
34104 1 0 0 0 0
34105 1 0 0 0 0
35 36 1 0 0 0 0
35 37 1 0 0 0 0
35106 1 0 0 0 0
36107 1 0 0 0 0
36108 1 0 0 0 0
36109 1 0 0 0 0
37 38 2 0 0 0 0
37 39 1 0 0 0 0
39 40 1 0 0 0 0
39 42 1 0 0 0 0
39110 1 0 0 0 0
40 41 1 0 0 0 0
41111 1 0 0 0 0
41112 1 0 0 0 0
41113 1 0 0 0 0
42 43 1 0 0 0 0
42 44 1 0 0 0 0
42114 1 0 0 0 0
43115 1 0 0 0 0
44 45 1 0 0 0 0
44 46 2 0 0 0 0
45116 1 0 0 0 0
45117 1 0 0 0 0
45118 1 0 0 0 0
46 47 1 0 0 0 0
46119 1 0 0 0 0
47 48 1 0 0 0 0
47 49 1 0 0 0 0
47120 1 0 0 0 0
48121 1 0 0 0 0
48122 1 0 0 0 0
48123 1 0 0 0 0
49 50 2 0 0 0 0
49 51 1 0 0 0 0
51 52 1 0 0 0 0
51124 1 0 0 0 0
51125 1 0 0 0 0
52 53 1 0 0 0 0
52126 1 0 0 0 0
53 54 1 0 0 0 0
53 55 1 0 0 0 0
53127 1 0 0 0 0
54128 1 0 0 0 0
54129 1 0 0 0 0
54130 1 0 0 0 0
55 56 1 0 0 0 0
55131 1 0 0 0 0
55132 1 0 0 0 0
56 57 1 0 0 0 0
56 64 1 0 0 0 0
56133 1 0 0 0 0
57 58 1 0 0 0 0
57134 1 0 0 0 0
57135 1 0 0 0 0
58 59 1 0 0 0 0
58 61 1 0 0 0 0
58136 1 0 0 0 0
59 60 1 0 0 0 0
60137 1 0 0 0 0
60138 1 0 0 0 0
60139 1 0 0 0 0
61 62 1 0 0 0 0
61 63 1 0 0 0 0
61140 1 0 0 0 0
62141 1 0 0 0 0
63 64 1 0 0 0 0
63142 1 0 0 0 0
63143 1 0 0 0 0
64144 1 0 0 0 0
64145 1 0 0 0 0
65 66 2 0 0 0 0
65 68 1 0 0 0 0
65 69 1 0 0 0 0
66 67 1 0 0 0 0
66146 1 0 0 0 0
67147 1 0 0 0 0
68148 1 0 0 0 0
68149 1 0 0 0 0
68150 1 0 0 0 0
M END
> <OPENEYE_ISO_SMILES>
Cc1ccc(s1)[C@@H]\2C[C@@H]3CC[C@H]([C@@](O3)(C(=O)C(=O)N4CCCC[C@H]4C(=O)O[C@@H](CC(=O)[C@@H](/C=C(/[C@H]([C@H](C(=O)[C@@H](C[C@@H](/C=C/C=C/C=C2\C)C)C)OC)O)\C)C)[C@H](C)C[C@@H]5CC[C@H]([C@@H](C5)OC)O)O)C
> <OPENEYE_INCHI>
InChI=1S/C55H81NO12S/c1-32-16-12-11-13-17-33(2)42(48-24-20-39(8)69-48)30-41-22-19-38(7)55(64,68-41)52(61)53(62)56-25-15-14-18-43(56)54(63)67-46(35(4)28-40-21-23-44(57)47(29-40)65-9)31-45(58)34(3)27-37(6)50(60)51(66-10)49(59)36(5)26-32/h11-13,16-17,20,24,27,32,34-36,38,40-44,46-47,50-51,57,60,64H,14-15,18-19,21-23,25-26,28-31H2,1-10H3/b13-11+,16-12+,33-17+,37-27+/t32-,34-,35-,36-,38-,40+,41+,42-,43+,44-,46+,47-,50-,51+,55-/m1/s1
> <OPENEYE_INCHIKEY>
SDSGJAIFUCCAOV-MSLSVLDMSA-N
> <FORMULA>
C55H81NO12S
$$$$

View file

@ -0,0 +1,14 @@
# Job configuration for the "3fap" example: two protein chains plus one
# small-molecule input.
defaults:
  - base

job_name: "3fap"

# One entry per chain; each entry is looked up as
# protein_inputs[chain]["fasta_file"] by the model runner.
protein_inputs:
  A:
    fasta_file: examples/protein/3fap_A.fasta
  B:
    fasta_file: examples/protein/3fap_B.fasta

# NOTE(review): presumably keyed by chain letter like protein_inputs — confirm
# against the code that consumes sm_inputs.
sm_inputs:
  C:
    input: examples/small_molecule/ARD_ideal.sdf
    input_type: "sdf"

View file

@ -28,6 +28,10 @@ class RawInputData:
def query_sequence(self):
    """Return the first entry of ``self.msa``, i.e. the query sequence."""
    first_row = self.msa[0]
    return first_row
def sequence_string(self):
    """Return the query sequence rendered as a single string.

    Every entry of ``self.query_sequence()`` is looked up in
    ``ChemData().num2aa``; each looked-up value is then translated through
    ``ChemData().aa_321`` and the results are concatenated.
    """
    # assumes num2aa yields three-letter residue names and aa_321 maps those
    # to one-letter codes — TODO confirm against ChemData
    residue_names = []
    for token in self.query_sequence():
        residue_names.append(ChemData().num2aa[token])

    letters = []
    for residue_name in residue_names:
        letters.append(ChemData().aa_321[residue_name])
    return "".join(letters)
def is_atom(self):
    """Apply the ``is_atom`` helper to this input's query sequence.

    Inside a method body the bare name ``is_atom`` is resolved in the
    enclosing module scope rather than on the class, so this delegates to
    the helper of the same name instead of calling itself.
    """
    # NOTE(review): assumes a module-level ``is_atom`` is imported here — confirm.
    query = self.query_sequence()
    return is_atom(query)

View file

@ -548,7 +548,7 @@ def join_msas_by_taxid(a3mA, a3mB, idx_overlap=None):
# pair sequences
taxids_shared = a3mA['taxid'][np.isin(a3mA['taxid'],a3mB['taxid'])]
i_pairedA, i_pairedB = [], []
for taxid in taxids_shared:
i_match = np.where(a3mA['taxid']==taxid)[0]
i_match_best = torch.argmin(torch.sum(a3mA['msa'][i_match]==a3mA['msa'][0], axis=1))
@ -744,7 +744,7 @@ def load_minimal_multi_msa(hash_list, taxid_list, Ls, params):
return a3m_out, hashes_out, Ls_out
def expand_multi_msa(a3m, hashes_in, hashes_out, Ls_in, Ls_out, params):
def expand_multi_msa(a3m, hashes_in, hashes_out, Ls_in, Ls_out):
"""Expands a multi-MSA of unique chains into an MSA of a
hetero-homo-oligomer in which some chains appear more than once. The query
sequences (1st sequence of MSA) are concatenated directly along the

View file

@ -1,6 +1,7 @@
import torch
from hashlib import md5
from rf2aa.data.data_loader_utils import merge_a3m_hetero, merge_a3m_homo, merge_hetero_templates, get_term_feats
from rf2aa.data.data_loader_utils import merge_a3m_hetero, merge_a3m_homo, merge_hetero_templates, get_term_feats, join_msas_by_taxid, expand_multi_msa
from rf2aa.data.data_loader import RawInputData
from rf2aa.util import center_and_realign_missing, same_chain_from_bond_feats, random_rot_trans, idx_from_Ls
@ -18,7 +19,71 @@ def merge_protein_inputs(protein_inputs, deterministic: bool = False):
# handle merging MSAs and such
# first determine which sequence are identical, then which one have mergeable MSAs
# then cat the templates, other feats
pass
else:
a3m_list = [
{"msa": input.msa,
"ins": input.ins,
"taxid": input.taxids
}
for input in protein_inputs.values()
]
hash_list = [md5(input.sequence_string().encode()).hexdigest() for input in protein_inputs.values()]
lengths_list = [input.length() for input in protein_inputs.values()]
seen = set()
unique_indices = []
for idx, hash in enumerate(hash_list):
if hash not in seen:
unique_indices.append(idx)
seen.add(hash)
unique_a3m = [a3m for i, a3m in enumerate(a3m_list) if i in unique_indices ]
unique_hashes = [value for index, value in enumerate(hash_list) if index in unique_indices]
unique_lengths_list = [value for index, value in enumerate(lengths_list) if index in unique_indices]
if len(unique_a3m) >1:
a3m_out = unique_a3m[0]
for i in range(1, len(unique_a3m)):
a3m_out = join_msas_by_taxid(a3m_out, a3m_list[i])
a3m_out = expand_multi_msa(a3m_out, unique_hashes, hash_list, unique_lengths_list, lengths_list)
else:
a3m = unique_a3m[0]
msa, ins = a3m["msa"], a3m["ins"]
a3m_out = merge_a3m_homo(msa, ins, len(hash_list))
# merge templates
max_template_dim = max([input.xyz_t.shape[0] for input in protein_inputs.values()])
xyz_t_list = [input.xyz_t for input in protein_inputs.values()]
mask_t_list = [input.mask_t for input in protein_inputs.values()]
t1d_list = [input.t1d for input in protein_inputs.values()]
ids = ["inference"] * len(t1d_list)
xyz_t, t1d, mask_t, _ = merge_hetero_templates(xyz_t_list, t1d_list, mask_t_list, ids, lengths_list, deterministic=deterministic)
atom_frames = torch.zeros(0,3,2)
chirals = torch.zeros(0,5)
L_total = sum(lengths_list)
bond_feats = torch.zeros((L_total, L_total)).long()
offset = 0
for bf in [input.bond_feats for input in protein_inputs.values()]:
L = bf.shape[0]
bond_feats[offset:offset+L, offset:offset+L] = bf
offset += L
chain_lengths = list(zip(protein_inputs.keys(), lengths_list))
merged_input = RawInputData(
a3m_out["msa"],
a3m_out["ins"],
bond_feats,
xyz_t[:max_template_dim],
mask_t[:max_template_dim],
t1d[:max_template_dim],
chirals,
atom_frames,
taxids=None
)
return merged_input, chain_lengths
def merge_na_inputs(na_inputs):
# should just be trivially catting features
@ -101,14 +166,6 @@ def merge_all(
deterministic: bool = False,
):
#protein_lengths = [protein_input.length() for protein_input in protein_inputs.values()]
#na_lengths = [na_input.length() for na_input in na_inputs.values()]
#sm_lengths = [sm_input.length() for sm_input in sm_inputs.values()]
#all_lengths = protein_lengths + na_lengths + sm_lengths
#term_info = get_term_feats(all_lengths)
#term_info[sum(protein_lengths):, :] = 0
protein_inputs, protein_chain_lengths = merge_protein_inputs(protein_inputs, deterministic=deterministic)
na_inputs, na_chain_lengths = merge_na_inputs(na_inputs)

View file

@ -414,18 +414,21 @@ def parse_a3m(filename, maxseq=8000, paired=False):
else:
fstream = open(filename, 'r')
for line in fstream:
for i, line in enumerate(fstream):
# skip labels
if line[0] == '>':
if paired: # paired MSAs only have a TAXID in the fasta header
taxIDs.append(line[1:].strip())
else: # unpaired MSAs have all the metadata so use regex to pull out TAXID
match = re.search( r'TaxID=(\d+)', line)
if match:
taxIDs.append(match.group(1))
if i == 0:
taxIDs.append("query")
else:
taxIDs.append("query") # query sequence
match = re.search( r'TaxID=(\d+)', line)
if match:
taxIDs.append(match.group(1))
else:
taxIDs.append("") # query sequence
continue
# remove right whitespaces

View file

@ -8,11 +8,12 @@ import subprocess
def make_msa(
fasta_file,
chain,
model_runner
):
out_dir_base = Path(model_runner.config.output_path)
hash = model_runner.config.job_name
out_dir = out_dir_base / hash
out_dir = out_dir_base / hash / chain
out_dir.mkdir(parents=True, exist_ok=True)
command = model_runner.config.database_params.command

View file

@ -88,6 +88,6 @@ def load_protein(msa_file, hhr_fn, atab_fn, model_runner):
taxids=taxIDs,
)
def generate_msa_and_load_protein(fasta_file, model_runner):
msa_file, hhr_file, atab_file = make_msa(fasta_file, model_runner)
def generate_msa_and_load_protein(fasta_file, chain, model_runner):
    """Build the MSA files for one chain and load them as a protein input.

    Calls ``make_msa(fasta_file, chain, model_runner)``, which must yield
    exactly three values, converts each to ``str`` and forwards them together
    with ``model_runner`` to ``load_protein``, whose result is returned.
    """
    msa_path, hhr_path, atab_path = make_msa(fasta_file, chain, model_runner)
    # load_protein is handed string paths, in the same order make_msa produced them.
    paths_as_text = [str(msa_path), str(hhr_path), str(atab_path)]
    return load_protein(*paths_as_text, model_runner)

View file

@ -45,6 +45,7 @@ class ModelRunner:
chains.append(chain)
protein_input = generate_msa_and_load_protein(
self.config.protein_inputs[chain]["fasta_file"],
chain,
self
)
protein_inputs[chain] = protein_input