Merge pull request #1 from baker-laboratory/multimer

Multimer
This commit is contained in:
Rohith Krishna 2024-03-05 16:54:01 -08:00 committed by GitHub
commit d96e013a54
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 426 additions and 172 deletions

View file

@ -0,0 +1,2 @@
>3FAP_1|Chain A|FK506-BINDING PROTEIN|Homo sapiens (9606)
GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPFKFMLGKQEVIRGWEEGVAQMSVGQRAKLTISPDYAYGATGHPGIIPPHATLVFDVELLKLE

View file

@ -0,0 +1,2 @@
>3FAP_2|Chain B|FKBP12-RAPAMYCIN ASSOCIATED PROTEIN|Homo sapiens (9606)
VAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMERGPQTLKETSFNQAYGRDLMEAQEWCRKYMKSGNVKDLTQAWDLYYHVFRRIS

View file

@ -0,0 +1,322 @@
ARD
-OEChem-02232415173D
150154 0 1 0 0 0 0 0999 V2000
-1.7790 -1.8400 2.4660 O 0 0 0 0 0 0 0 0 0 0 0 0
-0.5750 -1.3280 2.8030 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.1380 -0.4090 2.1630 O 0 0 0 0 0 0 0 0 0 0 0 0
0.1530 -1.9370 3.9570 C 0 0 2 0 0 0 0 0 0 0 0 0
-0.5340 -1.5700 5.2770 C 0 0 0 0 0 0 0 0 0 0 0 0
0.1240 -2.3500 6.4190 C 0 0 0 0 0 0 0 0 0 0 0 0
1.6170 -2.0190 6.4880 C 0 0 0 0 0 0 0 0 0 0 0 0
2.2820 -2.4020 5.1570 C 0 0 0 0 0 0 0 0 0 0 0 0
1.5790 -1.6200 4.1030 N 0 0 0 0 0 0 0 0 0 0 0 0
2.2890 -0.7310 3.4090 C 0 0 0 0 0 0 0 0 0 0 0 0
1.7590 0.2990 3.0480 O 0 0 0 0 0 0 0 0 0 0 0 0
3.7290 -0.8580 3.1350 C 0 0 0 0 0 0 0 0 0 0 0 0
4.1990 -1.9680 3.1760 O 0 0 0 0 0 0 0 0 0 0 0 0
4.6500 0.2750 2.7820 C 0 0 1 0 0 0 0 0 0 0 0 0
6.0730 -0.0990 3.2010 C 0 0 1 0 0 0 0 0 0 0 0 0
7.0540 1.0150 2.8270 C 0 0 0 0 0 0 0 0 0 0 0 0
6.9100 1.2850 1.3170 C 0 0 0 0 0 0 0 0 0 0 0 0
5.4410 1.6400 1.0770 C 0 0 2 0 0 0 0 0 0 0 0 0
4.6020 0.5430 1.3850 O 0 0 0 0 0 0 0 0 0 0 0 0
4.2490 1.4380 3.5170 O 0 0 0 0 0 0 0 0 0 0 0 0
6.1240 -0.3020 4.7200 C 0 0 0 0 0 0 0 0 0 0 0 0
5.2000 2.2840 -0.2620 C 0 0 0 0 0 0 0 0 0 0 0 0
5.7060 1.5450 -1.4960 C 0 0 1 0 0 0 0 0 0 0 0 0
5.5470 0.0580 -1.3360 C 0 0 0 0 0 0 0 0 0 0 0 0
4.8870 2.0220 -2.6780 C 0 0 0 0 0 0 0 0 0 0 0 0
4.6730 3.5180 -2.8110 C 0 0 0 0 0 0 0 0 0 0 0 0
4.3300 1.2230 -3.5690 C 0 0 0 0 0 0 0 0 0 0 0 0
3.4840 1.7930 -4.6240 C 0 0 0 0 0 0 0 0 0 0 0 0
2.7570 0.9680 -5.4040 C 0 0 0 0 0 0 0 0 0 0 0 0
1.8610 1.5580 -6.3980 C 0 0 0 0 0 0 0 0 0 0 0 0
1.0770 0.7650 -7.1230 C 0 0 0 0 0 0 0 0 0 0 0 0
0.1320 1.3510 -8.1390 C 0 0 1 0 0 0 0 0 0 0 0 0
0.7560 1.2520 -9.5340 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.1760 0.5550 -8.1080 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.9730 0.9210 -6.8550 C 0 0 1 0 0 0 0 0 0 0 0 0
-2.7810 2.1950 -7.1240 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.9200 -0.2000 -6.5090 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.9910 -1.1750 -7.2160 O 0 0 0 0 0 0 0 0 0 0 0 0
-3.7750 -0.1010 -5.2690 C 0 0 2 0 0 0 0 0 0 0 0 0
-2.9350 0.0970 -4.1300 O 0 0 0 0 0 0 0 0 0 0 0 0
-2.7650 1.5080 -3.9830 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.5910 -1.3810 -5.0870 C 0 0 2 0 0 0 0 0 0 0 0 0
-5.7300 -1.3510 -5.9480 O 0 0 0 0 0 0 0 0 0 0 0 0
-5.0540 -1.4780 -3.6520 C 0 0 0 0 0 0 0 0 0 0 0 0
-6.1090 -0.5300 -3.1490 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.5170 -2.3830 -2.8800 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.8930 -2.5500 -1.4300 C 0 0 1 0 0 0 0 0 0 0 0 0
-4.8210 -4.0370 -1.0630 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.8900 -1.7740 -0.6090 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.4830 -0.7100 -1.0220 O 0 0 0 0 0 0 0 0 0 0 0 0
-3.3920 -2.3030 0.7040 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.6090 -1.2170 1.4520 C 0 0 1 0 0 0 0 0 0 0 0 0
-3.5900 -0.2550 2.1240 C 0 0 2 0 0 0 0 0 0 0 0 0
-4.4730 0.3980 1.0580 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.8110 0.8260 2.8740 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.7860 1.7040 3.6610 C 0 0 1 0 0 0 0 0 0 0 0 0
-4.4510 0.8710 4.7590 C 0 0 0 0 0 0 0 0 0 0 0 0
-5.4260 1.7490 5.5460 C 0 0 1 0 0 0 0 0 0 0 0 0
-6.0460 0.9710 6.5720 O 0 0 0 0 0 0 0 0 0 0 0 0
-7.3460 1.5260 6.7810 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.6640 2.9150 6.1800 C 0 0 2 0 0 0 0 0 0 0 0 0
-5.5750 3.7350 6.9150 O 0 0 0 0 0 0 0 0 0 0 0 0
-4.0000 3.7480 5.0820 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.0250 2.8700 4.2950 C 0 0 0 0 0 0 0 0 0 0 0 0
4.8030 -2.5110 -0.9950 C 0 0 0 0 0 0 0 0 0 0 0 0
6.1120 -2.2280 -1.2040 C 0 0 0 0 0 0 0 0 0 0 0 0
6.4930 -0.9110 -1.3840 C 0 0 0 0 0 0 0 0 0 0 0 0
4.1450 -3.8500 -0.7790 C 0 0 0 0 0 0 0 0 0 0 0 0
4.0620 -0.8880 -1.0410 S 0 0 0 0 0 0 0 0 0 0 0 0
0.0770 -3.0380 3.8490 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.5890 -1.8360 5.2140 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.4290 -0.5030 5.4520 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.0020 -3.4180 6.2480 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.3530 -2.0760 7.3610 H 0 0 0 0 0 0 0 0 0 0 0 0
2.0800 -2.5780 7.3010 H 0 0 0 0 0 0 0 0 0 0 0 0
1.7450 -0.9500 6.6630 H 0 0 0 0 0 0 0 0 0 0 0 0
2.1540 -3.4630 4.9650 H 0 0 0 0 0 0 0 0 0 0 0 0
3.3350 -2.1320 5.1760 H 0 0 0 0 0 0 0 0 0 0 0 0
6.3710 -1.0230 2.7070 H 0 0 0 0 0 0 0 0 0 0 0 0
8.0740 0.6960 3.0480 H 0 0 0 0 0 0 0 0 0 0 0 0
6.8170 1.9170 3.3880 H 0 0 0 0 0 0 0 0 0 0 0 0
7.1900 0.3900 0.7630 H 0 0 0 0 0 0 0 0 0 0 0 0
7.5510 2.1180 1.0280 H 0 0 0 0 0 0 0 0 0 0 0 0
5.2120 2.4360 1.8310 H 0 0 0 0 0 0 0 0 0 0 0 0
3.3500 1.6510 3.2330 H 0 0 0 0 0 0 0 0 0 0 0 0
7.1360 -0.5820 5.0150 H 0 0 0 0 0 0 0 0 0 0 0 0
5.8430 0.6230 5.2200 H 0 0 0 0 0 0 0 0 0 0 0 0
5.4310 -1.0940 5.0040 H 0 0 0 0 0 0 0 0 0 0 0 0
5.6660 3.2750 -0.2390 H 0 0 0 0 0 0 0 0 0 0 0 0
4.1180 2.4310 -0.3660 H 0 0 0 0 0 0 0 0 0 0 0 0
6.7560 1.7870 -1.6720 H 0 0 0 0 0 0 0 0 0 0 0 0
5.1280 3.8690 -3.7370 H 0 0 0 0 0 0 0 0 0 0 0 0
3.6040 3.7320 -2.8280 H 0 0 0 0 0 0 0 0 0 0 0 0
5.1320 4.0280 -1.9650 H 0 0 0 0 0 0 0 0 0 0 0 0
4.4740 0.1510 -3.5930 H 0 0 0 0 0 0 0 0 0 0 0 0
3.4240 2.8510 -4.8190 H 0 0 0 0 0 0 0 0 0 0 0 0
2.8420 -0.0990 -5.2940 H 0 0 0 0 0 0 0 0 0 0 0 0
1.8440 2.6280 -6.5350 H 0 0 0 0 0 0 0 0 0 0 0 0
1.1250 -0.3040 -6.9730 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.0570 2.3960 -7.9020 H 0 0 0 0 0 0 0 0 0 0 0 0
1.6870 1.8180 -9.5570 H 0 0 0 0 0 0 0 0 0 0 0 0
0.9600 0.2070 -9.7670 H 0 0 0 0 0 0 0 0 0 0 0 0
0.0650 1.6600 -10.2710 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.9490 -0.5100 -8.0980 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.7640 0.7950 -8.9950 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.2920 1.0940 -6.0220 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.3510 2.4590 -6.2340 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.1010 3.0090 -7.3750 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.4640 2.0230 -7.9560 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.4560 0.7450 -5.3670 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.1280 1.7100 -3.1220 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.3010 1.9140 -4.8820 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.7380 1.9770 -3.8330 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.9720 -2.2470 -5.3190 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.2150 -2.1740 -5.7980 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.3130 -0.7370 -2.0990 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.7550 0.4950 -3.2540 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.0220 -0.6610 -3.7300 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.7640 -3.0400 -3.3040 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.8960 -2.1880 -1.2300 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.5460 -4.5940 -1.6570 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.8190 -4.4140 -1.2660 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.0480 -4.1610 -0.0040 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.7370 -3.1570 0.5320 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.2360 -2.6120 1.3210 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.9770 -0.6780 0.7470 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.2160 -0.8050 2.8260 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.8430 0.8690 0.3030 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.0980 -0.3600 0.5880 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.1060 1.1530 1.5250 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.2630 1.4410 2.1600 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.1090 0.3560 3.5630 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.5500 2.0930 2.9870 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.6870 0.4830 5.4330 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.9930 0.0410 4.3070 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.1890 2.1380 4.8720 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.8610 0.9620 7.5590 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.2520 2.5670 7.0890 H 0 0 0 0 0 0 0 0 0 0 0 0
-7.9180 1.4710 5.8550 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.9010 2.5270 6.8540 H 0 0 0 0 0 0 0 0 0 0 0 0
-5.0560 4.4560 7.2970 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.4580 4.5790 5.5340 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.7640 4.1360 4.4080 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.2610 2.4820 4.9690 H 0 0 0 0 0 0 0 0 0 0 0 0
-2.5520 3.4630 3.5130 H 0 0 0 0 0 0 0 0 0 0 0 0
6.8470 -3.0190 -1.2330 H 0 0 0 0 0 0 0 0 0 0 0 0
7.5290 -0.6580 -1.5530 H 0 0 0 0 0 0 0 0 0 0 0 0
3.0730 -3.7090 -0.6420 H 0 0 0 0 0 0 0 0 0 0 0 0
4.3200 -4.4850 -1.6480 H 0 0 0 0 0 0 0 0 0 0 0 0
4.5660 -4.3230 0.1070 H 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
1 52 1 0 0 0 0
2 3 2 0 0 0 0
2 4 1 0 0 0 0
4 5 1 0 0 0 0
4 9 1 0 0 0 0
4 70 1 0 0 0 0
5 6 1 0 0 0 0
5 71 1 0 0 0 0
5 72 1 0 0 0 0
6 7 1 0 0 0 0
6 73 1 0 0 0 0
6 74 1 0 0 0 0
7 8 1 0 0 0 0
7 75 1 0 0 0 0
7 76 1 0 0 0 0
8 9 1 0 0 0 0
8 77 1 0 0 0 0
8 78 1 0 0 0 0
9 10 1 0 0 0 0
10 11 2 0 0 0 0
10 12 1 0 0 0 0
12 13 2 0 0 0 0
12 14 1 0 0 0 0
14 15 1 0 0 0 0
14 19 1 0 0 0 0
14 20 1 0 0 0 0
15 16 1 0 0 0 0
15 21 1 0 0 0 0
15 79 1 0 0 0 0
16 17 1 0 0 0 0
16 80 1 0 0 0 0
16 81 1 0 0 0 0
17 18 1 0 0 0 0
17 82 1 0 0 0 0
17 83 1 0 0 0 0
18 19 1 0 0 0 0
18 22 1 0 0 0 0
18 84 1 0 0 0 0
20 85 1 0 0 0 0
21 86 1 0 0 0 0
21 87 1 0 0 0 0
21 88 1 0 0 0 0
22 23 1 0 0 0 0
22 89 1 0 0 0 0
22 90 1 0 0 0 0
23 24 1 0 0 0 0
23 25 1 0 0 0 0
23 91 1 0 0 0 0
24 67 2 0 0 0 0
24 69 1 0 0 0 0
25 26 1 0 0 0 0
25 27 2 0 0 0 0
26 92 1 0 0 0 0
26 93 1 0 0 0 0
26 94 1 0 0 0 0
27 28 1 0 0 0 0
27 95 1 0 0 0 0
28 29 2 0 0 0 0
28 96 1 0 0 0 0
29 30 1 0 0 0 0
29 97 1 0 0 0 0
30 31 2 0 0 0 0
30 98 1 0 0 0 0
31 32 1 0 0 0 0
31 99 1 0 0 0 0
32 33 1 0 0 0 0
32 34 1 0 0 0 0
32100 1 0 0 0 0
33101 1 0 0 0 0
33102 1 0 0 0 0
33103 1 0 0 0 0
34 35 1 0 0 0 0
34104 1 0 0 0 0
34105 1 0 0 0 0
35 36 1 0 0 0 0
35 37 1 0 0 0 0
35106 1 0 0 0 0
36107 1 0 0 0 0
36108 1 0 0 0 0
36109 1 0 0 0 0
37 38 2 0 0 0 0
37 39 1 0 0 0 0
39 40 1 0 0 0 0
39 42 1 0 0 0 0
39110 1 0 0 0 0
40 41 1 0 0 0 0
41111 1 0 0 0 0
41112 1 0 0 0 0
41113 1 0 0 0 0
42 43 1 0 0 0 0
42 44 1 0 0 0 0
42114 1 0 0 0 0
43115 1 0 0 0 0
44 45 1 0 0 0 0
44 46 2 0 0 0 0
45116 1 0 0 0 0
45117 1 0 0 0 0
45118 1 0 0 0 0
46 47 1 0 0 0 0
46119 1 0 0 0 0
47 48 1 0 0 0 0
47 49 1 0 0 0 0
47120 1 0 0 0 0
48121 1 0 0 0 0
48122 1 0 0 0 0
48123 1 0 0 0 0
49 50 2 0 0 0 0
49 51 1 0 0 0 0
51 52 1 0 0 0 0
51124 1 0 0 0 0
51125 1 0 0 0 0
52 53 1 0 0 0 0
52126 1 0 0 0 0
53 54 1 0 0 0 0
53 55 1 0 0 0 0
53127 1 0 0 0 0
54128 1 0 0 0 0
54129 1 0 0 0 0
54130 1 0 0 0 0
55 56 1 0 0 0 0
55131 1 0 0 0 0
55132 1 0 0 0 0
56 57 1 0 0 0 0
56 64 1 0 0 0 0
56133 1 0 0 0 0
57 58 1 0 0 0 0
57134 1 0 0 0 0
57135 1 0 0 0 0
58 59 1 0 0 0 0
58 61 1 0 0 0 0
58136 1 0 0 0 0
59 60 1 0 0 0 0
60137 1 0 0 0 0
60138 1 0 0 0 0
60139 1 0 0 0 0
61 62 1 0 0 0 0
61 63 1 0 0 0 0
61140 1 0 0 0 0
62141 1 0 0 0 0
63 64 1 0 0 0 0
63142 1 0 0 0 0
63143 1 0 0 0 0
64144 1 0 0 0 0
64145 1 0 0 0 0
65 66 2 0 0 0 0
65 68 1 0 0 0 0
65 69 1 0 0 0 0
66 67 1 0 0 0 0
66146 1 0 0 0 0
67147 1 0 0 0 0
68148 1 0 0 0 0
68149 1 0 0 0 0
68150 1 0 0 0 0
M END
> <OPENEYE_ISO_SMILES>
Cc1ccc(s1)[C@@H]\2C[C@@H]3CC[C@H]([C@@](O3)(C(=O)C(=O)N4CCCC[C@H]4C(=O)O[C@@H](CC(=O)[C@@H](/C=C(/[C@H]([C@H](C(=O)[C@@H](C[C@@H](/C=C/C=C/C=C2\C)C)C)OC)O)\C)C)[C@H](C)C[C@@H]5CC[C@H]([C@@H](C5)OC)O)O)C
> <OPENEYE_INCHI>
InChI=1S/C55H81NO12S/c1-32-16-12-11-13-17-33(2)42(48-24-20-39(8)69-48)30-41-22-19-38(7)55(64,68-41)52(61)53(62)56-25-15-14-18-43(56)54(63)67-46(35(4)28-40-21-23-44(57)47(29-40)65-9)31-45(58)34(3)27-37(6)50(60)51(66-10)49(59)36(5)26-32/h11-13,16-17,20,24,27,32,34-36,38,40-44,46-47,50-51,57,60,64H,14-15,18-19,21-23,25-26,28-31H2,1-10H3/b13-11+,16-12+,33-17+,37-27+/t32-,34-,35-,36-,38-,40+,41+,42-,43+,44-,46+,47-,50-,51+,55-/m1/s1
> <OPENEYE_INCHIKEY>
SDSGJAIFUCCAOV-MSLSVLDMSA-N
> <FORMULA>
C55H81NO12S
$$$$

View file

@ -0,0 +1,14 @@
# Inference config for the 3FAP example: two protein chains and one small molecule.
defaults:
  - base

job_name: "3fap"

# Protein chains, keyed by chain letter; each chain is read from a FASTA file.
protein_inputs:
  A:
    fasta_file: examples/protein/3fap_A.fasta
  B:
    fasta_file: examples/protein/3fap_B.fasta

# Small-molecule inputs, keyed by chain letter.
sm_inputs:
  C:
    input: examples/small_molecule/ARD_ideal.sdf
    input_type: "sdf"

View file

@ -28,6 +28,10 @@ class RawInputData:
def query_sequence(self): def query_sequence(self):
return self.msa[0] return self.msa[0]
def sequence_string(self):
    """Return the query sequence as one concatenated string.

    Each entry of ``self.query_sequence()`` is mapped through
    ``ChemData().num2aa`` and that result through ``ChemData().aa_321``;
    the mapped values are joined in order.
    """
    # NOTE(review): the table names suggest index -> three-letter code ->
    # one-letter code; confirm against ChemData.
    chem = ChemData()  # fetch the lookup tables once instead of per residue
    return "".join(chem.aa_321[chem.num2aa[num]] for num in self.query_sequence())
def is_atom(self): def is_atom(self):
return is_atom(self.query_sequence()) return is_atom(self.query_sequence())

View file

@ -548,7 +548,7 @@ def join_msas_by_taxid(a3mA, a3mB, idx_overlap=None):
# pair sequences # pair sequences
taxids_shared = a3mA['taxid'][np.isin(a3mA['taxid'],a3mB['taxid'])] taxids_shared = a3mA['taxid'][np.isin(a3mA['taxid'],a3mB['taxid'])]
i_pairedA, i_pairedB = [], [] i_pairedA, i_pairedB = [], []
for taxid in taxids_shared: for taxid in taxids_shared:
i_match = np.where(a3mA['taxid']==taxid)[0] i_match = np.where(a3mA['taxid']==taxid)[0]
i_match_best = torch.argmin(torch.sum(a3mA['msa'][i_match]==a3mA['msa'][0], axis=1)) i_match_best = torch.argmin(torch.sum(a3mA['msa'][i_match]==a3mA['msa'][0], axis=1))
@ -744,7 +744,7 @@ def load_minimal_multi_msa(hash_list, taxid_list, Ls, params):
return a3m_out, hashes_out, Ls_out return a3m_out, hashes_out, Ls_out
def expand_multi_msa(a3m, hashes_in, hashes_out, Ls_in, Ls_out, params): def expand_multi_msa(a3m, hashes_in, hashes_out, Ls_in, Ls_out):
"""Expands a multi-MSA of unique chains into an MSA of a """Expands a multi-MSA of unique chains into an MSA of a
hetero-homo-oligomer in which some chains appear more than once. The query hetero-homo-oligomer in which some chains appear more than once. The query
sequences (1st sequence of MSA) are concatenated directly along the sequences (1st sequence of MSA) are concatenated directly along the

View file

@ -1,6 +1,7 @@
import torch import torch
from hashlib import md5
from rf2aa.data.data_loader_utils import merge_a3m_hetero, merge_a3m_homo, merge_hetero_templates, get_term_feats from rf2aa.data.data_loader_utils import merge_a3m_hetero, merge_a3m_homo, merge_hetero_templates, get_term_feats, join_msas_by_taxid, expand_multi_msa
from rf2aa.data.data_loader import RawInputData from rf2aa.data.data_loader import RawInputData
from rf2aa.util import center_and_realign_missing, same_chain_from_bond_feats, random_rot_trans, idx_from_Ls from rf2aa.util import center_and_realign_missing, same_chain_from_bond_feats, random_rot_trans, idx_from_Ls
@ -18,7 +19,71 @@ def merge_protein_inputs(protein_inputs, deterministic: bool = False):
# handle merging MSAs and such # handle merging MSAs and such
# first determine which sequence are identical, then which one have mergeable MSAs # first determine which sequence are identical, then which one have mergeable MSAs
# then cat the templates, other feats # then cat the templates, other feats
pass else:
a3m_list = [
{"msa": input.msa,
"ins": input.ins,
"taxid": input.taxids
}
for input in protein_inputs.values()
]
hash_list = [md5(input.sequence_string().encode()).hexdigest() for input in protein_inputs.values()]
lengths_list = [input.length() for input in protein_inputs.values()]
seen = set()
unique_indices = []
for idx, hash in enumerate(hash_list):
if hash not in seen:
unique_indices.append(idx)
seen.add(hash)
unique_a3m = [a3m for i, a3m in enumerate(a3m_list) if i in unique_indices ]
unique_hashes = [value for index, value in enumerate(hash_list) if index in unique_indices]
unique_lengths_list = [value for index, value in enumerate(lengths_list) if index in unique_indices]
if len(unique_a3m) >1:
a3m_out = unique_a3m[0]
for i in range(1, len(unique_a3m)):
a3m_out = join_msas_by_taxid(a3m_out, a3m_list[i])
a3m_out = expand_multi_msa(a3m_out, unique_hashes, hash_list, unique_lengths_list, lengths_list)
else:
a3m = unique_a3m[0]
msa, ins = a3m["msa"], a3m["ins"]
a3m_out = merge_a3m_homo(msa, ins, len(hash_list))
# merge templates
max_template_dim = max([input.xyz_t.shape[0] for input in protein_inputs.values()])
xyz_t_list = [input.xyz_t for input in protein_inputs.values()]
mask_t_list = [input.mask_t for input in protein_inputs.values()]
t1d_list = [input.t1d for input in protein_inputs.values()]
ids = ["inference"] * len(t1d_list)
xyz_t, t1d, mask_t, _ = merge_hetero_templates(xyz_t_list, t1d_list, mask_t_list, ids, lengths_list, deterministic=deterministic)
atom_frames = torch.zeros(0,3,2)
chirals = torch.zeros(0,5)
L_total = sum(lengths_list)
bond_feats = torch.zeros((L_total, L_total)).long()
offset = 0
for bf in [input.bond_feats for input in protein_inputs.values()]:
L = bf.shape[0]
bond_feats[offset:offset+L, offset:offset+L] = bf
offset += L
chain_lengths = list(zip(protein_inputs.keys(), lengths_list))
merged_input = RawInputData(
a3m_out["msa"],
a3m_out["ins"],
bond_feats,
xyz_t[:max_template_dim],
mask_t[:max_template_dim],
t1d[:max_template_dim],
chirals,
atom_frames,
taxids=None
)
return merged_input, chain_lengths
def merge_na_inputs(na_inputs): def merge_na_inputs(na_inputs):
# should just be trivially catting features # should just be trivially catting features
@ -101,14 +166,6 @@ def merge_all(
deterministic: bool = False, deterministic: bool = False,
): ):
#protein_lengths = [protein_input.length() for protein_input in protein_inputs.values()]
#na_lengths = [na_input.length() for na_input in na_inputs.values()]
#sm_lengths = [sm_input.length() for sm_input in sm_inputs.values()]
#all_lengths = protein_lengths + na_lengths + sm_lengths
#term_info = get_term_feats(all_lengths)
#term_info[sum(protein_lengths):, :] = 0
protein_inputs, protein_chain_lengths = merge_protein_inputs(protein_inputs, deterministic=deterministic) protein_inputs, protein_chain_lengths = merge_protein_inputs(protein_inputs, deterministic=deterministic)
na_inputs, na_chain_lengths = merge_na_inputs(na_inputs) na_inputs, na_chain_lengths = merge_na_inputs(na_inputs)

View file

@ -414,18 +414,21 @@ def parse_a3m(filename, maxseq=8000, paired=False):
else: else:
fstream = open(filename, 'r') fstream = open(filename, 'r')
for line in fstream: for i, line in enumerate(fstream):
# skip labels # skip labels
if line[0] == '>': if line[0] == '>':
if paired: # paired MSAs only have a TAXID in the fasta header if paired: # paired MSAs only have a TAXID in the fasta header
taxIDs.append(line[1:].strip()) taxIDs.append(line[1:].strip())
else: # unpaired MSAs have all the metadata so use regex to pull out TAXID else: # unpaired MSAs have all the metadata so use regex to pull out TAXID
match = re.search( r'TaxID=(\d+)', line) if i == 0:
if match: taxIDs.append("query")
taxIDs.append(match.group(1))
else: else:
taxIDs.append("query") # query sequence match = re.search( r'TaxID=(\d+)', line)
if match:
taxIDs.append(match.group(1))
else:
taxIDs.append("") # query sequence
continue continue
# remove right whitespaces # remove right whitespaces

View file

@ -8,11 +8,12 @@ import subprocess
def make_msa( def make_msa(
fasta_file, fasta_file,
chain,
model_runner model_runner
): ):
out_dir_base = Path(model_runner.config.output_path) out_dir_base = Path(model_runner.config.output_path)
hash = model_runner.config.job_name hash = model_runner.config.job_name
out_dir = out_dir_base / hash out_dir = out_dir_base / hash / chain
out_dir.mkdir(parents=True, exist_ok=True) out_dir.mkdir(parents=True, exist_ok=True)
command = model_runner.config.database_params.command command = model_runner.config.database_params.command

View file

@ -88,6 +88,6 @@ def load_protein(msa_file, hhr_fn, atab_fn, model_runner):
taxids=taxIDs, taxids=taxIDs,
) )
def generate_msa_and_load_protein(fasta_file, model_runner): def generate_msa_and_load_protein(fasta_file, chain, model_runner):
msa_file, hhr_file, atab_file = make_msa(fasta_file, model_runner) msa_file, hhr_file, atab_file = make_msa(fasta_file, chain, model_runner)
return load_protein(str(msa_file), str(hhr_file), str(atab_file), model_runner) return load_protein(str(msa_file), str(hhr_file), str(atab_file), model_runner)

View file

@ -45,6 +45,7 @@ class ModelRunner:
chains.append(chain) chains.append(chain)
protein_input = generate_msa_and_load_protein( protein_input = generate_msa_and_load_protein(
self.config.protein_inputs[chain]["fasta_file"], self.config.protein_inputs[chain]["fasta_file"],
chain,
self self
) )
protein_inputs[chain] = protein_input protein_inputs[chain] = protein_input

View file

@ -1,79 +0,0 @@
import torch
import pandas as pd
import numpy as np
import itertools
from collections import OrderedDict
from hydra import initialize, compose
from rf2aa.setup_model import trainer_factory, seed_all
from rf2aa.chemical import ChemicalData as ChemData
# configurations to test
# Each name is passed to hydra's compose() as config_name when models are built.
configs = [
    "legacy_train",
]

# Dataset names the regression helpers know about.
datasets = [
    "compl",
    "na_compl",
    "rna",
    "sm_compl",
    "sm_compl_covale",
    "sm_compl_asmb",
]

# Overrides applied to every composed config.
cfg_overrides = [
    "loader_params.p_msa_mask=0.0",
    "loader_params.crop=100000",
    "loader_params.mintplt=0",
    "loader_params.maxtplt=2",
]
def make_deterministic(seed=0):
    """Seed the random number generators and request deterministic cuDNN.

    Args:
        seed: Value forwarded to ``seed_all``.
    """
    seed_all(seed)
    if torch.cuda.is_available():
        # Only touch the cuDNN switches when a CUDA device is present.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
def setup_dataset_names():
    """Map every name in the module-level ``datasets`` list to ``[name]``.

    Returns:
        dict: ``{name: [name]}`` for each dataset name, each value being a
        fresh single-element list.
    """
    return {name: [name] for name in datasets}
# set up models for regression tests
def setup_models(device="cpu"):
    """Build one model per entry of the module-level ``configs`` list.

    Args:
        device: Forwarded to ``trainer.construct_model``.

    Returns:
        dict: config name -> ``(config name, model, chem_params config)``.
    """
    # NOTE(review): the extent of the `with` block was reconstructed; the whole
    # loop body is kept inside the hydra context, which is the conservative
    # choice. Confirm against the original file.
    built = {}
    for config in configs:
        with initialize(version_base=None, config_path="../config/train"):
            cfg = compose(config_name=config, overrides=cfg_overrides)

            # initializing the model needs the chemical DB initialized. Force a reload
            ChemData.reset()
            ChemData(cfg.chem_params)

            trainer = trainer_factory[cfg.experiment.trainer](cfg)
            seed_all()  # re-seed immediately before the model is constructed
            trainer.construct_model(device=device)
            built[config] = (config, trainer.model, cfg.chem_params)
            # Drop the trainer reference; only the model is kept.
            trainer = None
    return built
# set up job array for regression
def setup_array(datasets, models, device="cpu"):
    """Return every (dataset entry, model entry) combination to test.

    Args:
        datasets: Dataset names to select from ``setup_dataset_names()``.
        models: Config names to select from ``setup_models()``.
        device: Forwarded to ``setup_models``.

    Returns:
        list: Cartesian product of the selected dataset entries and the
        selected model entries, datasets varying slowest.

    Raises:
        KeyError: If a requested dataset or model name is unknown.
    """
    data_by_name = setup_dataset_names()
    models_by_name = setup_models(device=device)
    chosen_data = [data_by_name[name] for name in datasets]
    chosen_models = [models_by_name[name] for name in models]
    return list(itertools.product(chosen_data, chosen_models))
def random_param_init(model):
    """Load seeded random values into ``model.model`` and ``model.shadow``.

    A state dict with one ``torch.randn_like`` tensor per named parameter of
    ``model.model`` is built after calling ``seed_all()``, then loaded into
    both ``model.model`` and ``model.shadow``.

    Args:
        model: Object exposing ``model`` and ``shadow`` sub-modules.

    Returns:
        The same ``model`` object, modified in place.
    """
    seed_all()
    with torch.no_grad():
        fake_state_dict = OrderedDict(
            (name, torch.randn_like(param))
            for name, param in model.model.named_parameters()
        )
        model.model.load_state_dict(fake_state_dict)
        model.shadow.load_state_dict(fake_state_dict)
    return model
def dataset_pickle_path(dataset_name):
    """Return the relative path of the stored regression inputs for a dataset.

    Args:
        dataset_name: Name embedded in the file name.

    Returns:
        str: ``test_pickles/data/<dataset_name>_regression.pt``.
    """
    return f"test_pickles/data/{dataset_name}_regression.pt"
def model_pickle_path(dataset_name, model_name):
    """Return the relative path of the stored model outputs for one case.

    Args:
        dataset_name: Dataset part of the file name.
        model_name: Model part of the file name (placed first).

    Returns:
        str: ``test_pickles/model/<model_name>_<dataset_name>_regression.pt``.
    """
    return f"test_pickles/model/{model_name}_{dataset_name}_regression.pt"
def loss_pickle_path(dataset_name, model_name, loss_name):
    """Return the relative path of the stored value of one loss for one case.

    Args:
        dataset_name: Dataset part of the file name (placed last).
        model_name: Model part of the file name.
        loss_name: Loss part of the file name (placed first).

    Returns:
        str: ``test_pickles/loss/<loss_name>_<model_name>_<dataset_name>_regression.pt``.
    """
    return f"test_pickles/loss/{loss_name}_{model_name}_{dataset_name}_regression.pt"

View file

@ -1,73 +0,0 @@
import os
import torch
import pytest
import warnings
warnings.filterwarnings("ignore")
from rf2aa.data.dataloader_adaptor import prepare_input
from rf2aa.training.recycling import run_model_forward_legacy
from rf2aa.tensor_util import assert_equal
from rf2aa.tests.test_conditions import setup_array,\
make_deterministic, dataset_pickle_path, model_pickle_path
from rf2aa.util_module import XYZConverter
from rf2aa.chemical import ChemicalData as ChemData
# goal is to test all the configs on a broad set of datasets
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    gpu = "cuda:0"
else:
    gpu = "cpu"

# (dataset entry, model entry) pairs that parametrize the legacy regression test.
legacy_test_conditions = setup_array(
    ["na_compl", "rna", "sm_compl", "sm_compl_covale"],
    ["legacy_train"],
    device=gpu,
)
# Outputs compared with a tolerance instead of exact equality.
_TOLERANT_OUTPUTS = ("alpha", "xyz_allatom", "seq", "pair", "state")


def _check_regression_output(output_name, got, want):
    """Compare one named model output against its stored baseline.

    Raises whatever the underlying comparison raises on a mismatch; the
    caller converts that into a ValueError naming the output.
    """
    if output_name == "logits_c6d":
        # This output is indexable; each element is compared separately.
        for i in range(len(want)):
            assert_equal(got[i], want[i])
    elif output_name in _TOLERANT_OUTPUTS:
        assert torch.allclose(got, want, atol=1e-4)
    else:
        assert_equal(got, want)


@pytest.mark.parametrize("example,model", legacy_test_conditions)
def test_regression_legacy(example, model):
    """Run the legacy forward pass and compare it with the stored outputs.

    If no stored outputs exist for this (dataset, model) pair, the current
    outputs are saved as the new baseline and nothing is compared.

    Raises:
        ValueError: If any output differs from the baseline; the original
            comparison error is chained as the cause.
    """
    dataset_name, dataset_inputs, model_name, model = setup_test(example, model)
    make_deterministic()
    output_i = run_model_forward_legacy(model, dataset_inputs, gpu)
    model_pickle = model_pickle_path(dataset_name, model_name)
    output_names = ("logits_c6d", "logits_aa", "logits_pae",
                    "logits_pde", "p_bind", "xyz", "alpha", "xyz_allatom",
                    "lddt", "seq", "pair", "state")

    if not os.path.exists(model_pickle):
        # First run for this case: record the baseline instead of testing.
        torch.save(output_i, model_pickle)
        return

    output_regression = torch.load(model_pickle, map_location=gpu)
    for idx, got in enumerate(output_i):
        want = output_regression[idx]
        name = output_names[idx]
        try:
            _check_regression_output(name, got, want)
        except Exception as e:
            raise ValueError(f"{name} not same for model: {model_name} on dataset: {dataset_name}") from e
def setup_test(example, model):
    """Prepare the chemical database, model and inputs for one regression case.

    Args:
        example: Dataset entry whose first element is the dataset name.
        model: ``(model_name, model, chem_config)`` tuple.

    Returns:
        tuple: ``(dataset_name, network_input, model_name, model)``, with the
        model moved to the module-level ``gpu`` device.
    """
    model_name, model, config = model
    # initialize chemical database
    ChemData.reset()  # force reload chemical data
    ChemData(config)
    model = model.to(gpu)

    dataset_name = example[0]
    dataloader_inputs = torch.load(dataset_pickle_path(dataset_name), map_location=gpu)
    xyz_converter = XYZConverter().to(gpu)
    # Only network_input is used; the remaining values are still unpacked so a
    # change in the number of values prepare_input returns fails here.
    (_task, _item, network_input, _true_crds, _mask_crds, _msa, _mask_msa,
     _unclamp, _negative, _symmRs, _Lasu, _ch_label) = prepare_input(
        dataloader_inputs, xyz_converter, gpu)
    return dataset_name, network_input, model_name, model