diff --git a/examples/protein/3fap_A.fasta b/examples/protein/3fap_A.fasta new file mode 100644 index 0000000..9bfb9e2 --- /dev/null +++ b/examples/protein/3fap_A.fasta @@ -0,0 +1,2 @@ +>3FAP_1|Chain A|FK506-BINDING PROTEIN|Homo sapiens (9606) +GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPFKFMLGKQEVIRGWEEGVAQMSVGQRAKLTISPDYAYGATGHPGIIPPHATLVFDVELLKLE diff --git a/examples/protein/3fap_B.fasta b/examples/protein/3fap_B.fasta new file mode 100644 index 0000000..b05bdb9 --- /dev/null +++ b/examples/protein/3fap_B.fasta @@ -0,0 +1,2 @@ +>3FAP_2|Chain B|FKBP12-RAPAMYCIN ASSOCIATED PROTEIN|Homo sapiens (9606) +VAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMERGPQTLKETSFNQAYGRDLMEAQEWCRKYMKSGNVKDLTQAWDLYYHVFRRIS diff --git a/examples/small_molecule/ARD_ideal.sdf b/examples/small_molecule/ARD_ideal.sdf new file mode 100644 index 0000000..c1111a3 --- /dev/null +++ b/examples/small_molecule/ARD_ideal.sdf @@ -0,0 +1,322 @@ +ARD + -OEChem-02232415173D + +150154 0 1 0 0 0 0 0999 V2000 + -1.7790 -1.8400 2.4660 O 0 0 0 0 0 0 0 0 0 0 0 0 + -0.5750 -1.3280 2.8030 C 0 0 0 0 0 0 0 0 0 0 0 0 + -0.1380 -0.4090 2.1630 O 0 0 0 0 0 0 0 0 0 0 0 0 + 0.1530 -1.9370 3.9570 C 0 0 2 0 0 0 0 0 0 0 0 0 + -0.5340 -1.5700 5.2770 C 0 0 0 0 0 0 0 0 0 0 0 0 + 0.1240 -2.3500 6.4190 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1.6170 -2.0190 6.4880 C 0 0 0 0 0 0 0 0 0 0 0 0 + 2.2820 -2.4020 5.1570 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1.5790 -1.6200 4.1030 N 0 0 0 0 0 0 0 0 0 0 0 0 + 2.2890 -0.7310 3.4090 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1.7590 0.2990 3.0480 O 0 0 0 0 0 0 0 0 0 0 0 0 + 3.7290 -0.8580 3.1350 C 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1990 -1.9680 3.1760 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.6500 0.2750 2.7820 C 0 0 1 0 0 0 0 0 0 0 0 0 + 6.0730 -0.0990 3.2010 C 0 0 1 0 0 0 0 0 0 0 0 0 + 7.0540 1.0150 2.8270 C 0 0 0 0 0 0 0 0 0 0 0 0 + 6.9100 1.2850 1.3170 C 0 0 0 0 0 0 0 0 0 0 0 0 + 5.4410 1.6400 1.0770 C 0 0 2 0 0 0 0 0 0 0 0 0 + 4.6020 0.5430 1.3850 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.2490 1.4380 3.5170 O 0 0 0 0 0 0 0 0 0 0 0 0 + 6.1240 -0.3020 4.7200 
C 0 0 0 0 0 0 0 0 0 0 0 0 + 5.2000 2.2840 -0.2620 C 0 0 0 0 0 0 0 0 0 0 0 0 + 5.7060 1.5450 -1.4960 C 0 0 1 0 0 0 0 0 0 0 0 0 + 5.5470 0.0580 -1.3360 C 0 0 0 0 0 0 0 0 0 0 0 0 + 4.8870 2.0220 -2.6780 C 0 0 0 0 0 0 0 0 0 0 0 0 + 4.6730 3.5180 -2.8110 C 0 0 0 0 0 0 0 0 0 0 0 0 + 4.3300 1.2230 -3.5690 C 0 0 0 0 0 0 0 0 0 0 0 0 + 3.4840 1.7930 -4.6240 C 0 0 0 0 0 0 0 0 0 0 0 0 + 2.7570 0.9680 -5.4040 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1.8610 1.5580 -6.3980 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1.0770 0.7650 -7.1230 C 0 0 0 0 0 0 0 0 0 0 0 0 + 0.1320 1.3510 -8.1390 C 0 0 1 0 0 0 0 0 0 0 0 0 + 0.7560 1.2520 -9.5340 C 0 0 0 0 0 0 0 0 0 0 0 0 + -1.1760 0.5550 -8.1080 C 0 0 0 0 0 0 0 0 0 0 0 0 + -1.9730 0.9210 -6.8550 C 0 0 1 0 0 0 0 0 0 0 0 0 + -2.7810 2.1950 -7.1240 C 0 0 0 0 0 0 0 0 0 0 0 0 + -2.9200 -0.2000 -6.5090 C 0 0 0 0 0 0 0 0 0 0 0 0 + -2.9910 -1.1750 -7.2160 O 0 0 0 0 0 0 0 0 0 0 0 0 + -3.7750 -0.1010 -5.2690 C 0 0 2 0 0 0 0 0 0 0 0 0 + -2.9350 0.0970 -4.1300 O 0 0 0 0 0 0 0 0 0 0 0 0 + -2.7650 1.5080 -3.9830 C 0 0 0 0 0 0 0 0 0 0 0 0 + -4.5910 -1.3810 -5.0870 C 0 0 2 0 0 0 0 0 0 0 0 0 + -5.7300 -1.3510 -5.9480 O 0 0 0 0 0 0 0 0 0 0 0 0 + -5.0540 -1.4780 -3.6520 C 0 0 0 0 0 0 0 0 0 0 0 0 + -6.1090 -0.5300 -3.1490 C 0 0 0 0 0 0 0 0 0 0 0 0 + -4.5170 -2.3830 -2.8800 C 0 0 0 0 0 0 0 0 0 0 0 0 + -4.8930 -2.5500 -1.4300 C 0 0 1 0 0 0 0 0 0 0 0 0 + -4.8210 -4.0370 -1.0630 C 0 0 0 0 0 0 0 0 0 0 0 0 + -3.8900 -1.7740 -0.6090 C 0 0 0 0 0 0 0 0 0 0 0 0 + -3.4830 -0.7100 -1.0220 O 0 0 0 0 0 0 0 0 0 0 0 0 + -3.3920 -2.3030 0.7040 C 0 0 0 0 0 0 0 0 0 0 0 0 + -2.6090 -1.2170 1.4520 C 0 0 1 0 0 0 0 0 0 0 0 0 + -3.5900 -0.2550 2.1240 C 0 0 2 0 0 0 0 0 0 0 0 0 + -4.4730 0.3980 1.0580 C 0 0 0 0 0 0 0 0 0 0 0 0 + -2.8110 0.8260 2.8740 C 0 0 0 0 0 0 0 0 0 0 0 0 + -3.7860 1.7040 3.6610 C 0 0 1 0 0 0 0 0 0 0 0 0 + -4.4510 0.8710 4.7590 C 0 0 0 0 0 0 0 0 0 0 0 0 + -5.4260 1.7490 5.5460 C 0 0 1 0 0 0 0 0 0 0 0 0 + -6.0460 0.9710 6.5720 O 0 0 0 0 0 0 0 0 0 0 0 0 + -7.3460 1.5260 6.7810 C 0 0 0 0 0 0 0 0 
0 0 0 0 + -4.6640 2.9150 6.1800 C 0 0 2 0 0 0 0 0 0 0 0 0 + -5.5750 3.7350 6.9150 O 0 0 0 0 0 0 0 0 0 0 0 0 + -4.0000 3.7480 5.0820 C 0 0 0 0 0 0 0 0 0 0 0 0 + -3.0250 2.8700 4.2950 C 0 0 0 0 0 0 0 0 0 0 0 0 + 4.8030 -2.5110 -0.9950 C 0 0 0 0 0 0 0 0 0 0 0 0 + 6.1120 -2.2280 -1.2040 C 0 0 0 0 0 0 0 0 0 0 0 0 + 6.4930 -0.9110 -1.3840 C 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1450 -3.8500 -0.7790 C 0 0 0 0 0 0 0 0 0 0 0 0 + 4.0620 -0.8880 -1.0410 S 0 0 0 0 0 0 0 0 0 0 0 0 + 0.0770 -3.0380 3.8490 H 0 0 0 0 0 0 0 0 0 0 0 0 + -1.5890 -1.8360 5.2140 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.4290 -0.5030 5.4520 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.0020 -3.4180 6.2480 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.3530 -2.0760 7.3610 H 0 0 0 0 0 0 0 0 0 0 0 0 + 2.0800 -2.5780 7.3010 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1.7450 -0.9500 6.6630 H 0 0 0 0 0 0 0 0 0 0 0 0 + 2.1540 -3.4630 4.9650 H 0 0 0 0 0 0 0 0 0 0 0 0 + 3.3350 -2.1320 5.1760 H 0 0 0 0 0 0 0 0 0 0 0 0 + 6.3710 -1.0230 2.7070 H 0 0 0 0 0 0 0 0 0 0 0 0 + 8.0740 0.6960 3.0480 H 0 0 0 0 0 0 0 0 0 0 0 0 + 6.8170 1.9170 3.3880 H 0 0 0 0 0 0 0 0 0 0 0 0 + 7.1900 0.3900 0.7630 H 0 0 0 0 0 0 0 0 0 0 0 0 + 7.5510 2.1180 1.0280 H 0 0 0 0 0 0 0 0 0 0 0 0 + 5.2120 2.4360 1.8310 H 0 0 0 0 0 0 0 0 0 0 0 0 + 3.3500 1.6510 3.2330 H 0 0 0 0 0 0 0 0 0 0 0 0 + 7.1360 -0.5820 5.0150 H 0 0 0 0 0 0 0 0 0 0 0 0 + 5.8430 0.6230 5.2200 H 0 0 0 0 0 0 0 0 0 0 0 0 + 5.4310 -1.0940 5.0040 H 0 0 0 0 0 0 0 0 0 0 0 0 + 5.6660 3.2750 -0.2390 H 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1180 2.4310 -0.3660 H 0 0 0 0 0 0 0 0 0 0 0 0 + 6.7560 1.7870 -1.6720 H 0 0 0 0 0 0 0 0 0 0 0 0 + 5.1280 3.8690 -3.7370 H 0 0 0 0 0 0 0 0 0 0 0 0 + 3.6040 3.7320 -2.8280 H 0 0 0 0 0 0 0 0 0 0 0 0 + 5.1320 4.0280 -1.9650 H 0 0 0 0 0 0 0 0 0 0 0 0 + 4.4740 0.1510 -3.5930 H 0 0 0 0 0 0 0 0 0 0 0 0 + 3.4240 2.8510 -4.8190 H 0 0 0 0 0 0 0 0 0 0 0 0 + 2.8420 -0.0990 -5.2940 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1.8440 2.6280 -6.5350 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1.1250 -0.3040 -6.9730 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.0570 2.3960 -7.9020 H 0 0 0 0 0 
0 0 0 0 0 0 0 + 1.6870 1.8180 -9.5570 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.9600 0.2070 -9.7670 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.0650 1.6600 -10.2710 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.9490 -0.5100 -8.0980 H 0 0 0 0 0 0 0 0 0 0 0 0 + -1.7640 0.7950 -8.9950 H 0 0 0 0 0 0 0 0 0 0 0 0 + -1.2920 1.0940 -6.0220 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.3510 2.4590 -6.2340 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.1010 3.0090 -7.3750 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.4640 2.0230 -7.9560 H 0 0 0 0 0 0 0 0 0 0 0 0 + -4.4560 0.7450 -5.3670 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.1280 1.7100 -3.1220 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.3010 1.9140 -4.8820 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.7380 1.9770 -3.8330 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.9720 -2.2470 -5.3190 H 0 0 0 0 0 0 0 0 0 0 0 0 + -6.2150 -2.1740 -5.7980 H 0 0 0 0 0 0 0 0 0 0 0 0 + -6.3130 -0.7370 -2.0990 H 0 0 0 0 0 0 0 0 0 0 0 0 + -5.7550 0.4950 -3.2540 H 0 0 0 0 0 0 0 0 0 0 0 0 + -7.0220 -0.6610 -3.7300 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.7640 -3.0400 -3.3040 H 0 0 0 0 0 0 0 0 0 0 0 0 + -5.8960 -2.1880 -1.2300 H 0 0 0 0 0 0 0 0 0 0 0 0 + -5.5460 -4.5940 -1.6570 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.8190 -4.4140 -1.2660 H 0 0 0 0 0 0 0 0 0 0 0 0 + -5.0480 -4.1610 -0.0040 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.7370 -3.1570 0.5320 H 0 0 0 0 0 0 0 0 0 0 0 0 + -4.2360 -2.6120 1.3210 H 0 0 0 0 0 0 0 0 0 0 0 0 + -1.9770 -0.6780 0.7470 H 0 0 0 0 0 0 0 0 0 0 0 0 + -4.2160 -0.8050 2.8260 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.8430 0.8690 0.3030 H 0 0 0 0 0 0 0 0 0 0 0 0 + -5.0980 -0.3600 0.5880 H 0 0 0 0 0 0 0 0 0 0 0 0 + -5.1060 1.1530 1.5250 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.2630 1.4410 2.1600 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.1090 0.3560 3.5630 H 0 0 0 0 0 0 0 0 0 0 0 0 + -4.5500 2.0930 2.9870 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.6870 0.4830 5.4330 H 0 0 0 0 0 0 0 0 0 0 0 0 + -4.9930 0.0410 4.3070 H 0 0 0 0 0 0 0 0 0 0 0 0 + -6.1890 2.1380 4.8720 H 0 0 0 0 0 0 0 0 0 0 0 0 + -7.8610 0.9620 7.5590 H 0 0 0 0 0 0 0 0 0 0 0 0 + -7.2520 2.5670 7.0890 H 0 0 0 0 0 0 0 0 0 0 0 0 + -7.9180 1.4710 5.8550 H 0 0 0 0 0 0 0 0 0 0 0 0 
+ -3.9010 2.5270 6.8540 H 0 0 0 0 0 0 0 0 0 0 0 0 + -5.0560 4.4560 7.2970 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.4580 4.5790 5.5340 H 0 0 0 0 0 0 0 0 0 0 0 0 + -4.7640 4.1360 4.4080 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.2610 2.4820 4.9690 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.5520 3.4630 3.5130 H 0 0 0 0 0 0 0 0 0 0 0 0 + 6.8470 -3.0190 -1.2330 H 0 0 0 0 0 0 0 0 0 0 0 0 + 7.5290 -0.6580 -1.5530 H 0 0 0 0 0 0 0 0 0 0 0 0 + 3.0730 -3.7090 -0.6420 H 0 0 0 0 0 0 0 0 0 0 0 0 + 4.3200 -4.4850 -1.6480 H 0 0 0 0 0 0 0 0 0 0 0 0 + 4.5660 -4.3230 0.1070 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 + 1 52 1 0 0 0 0 + 2 3 2 0 0 0 0 + 2 4 1 0 0 0 0 + 4 5 1 0 0 0 0 + 4 9 1 0 0 0 0 + 4 70 1 0 0 0 0 + 5 6 1 0 0 0 0 + 5 71 1 0 0 0 0 + 5 72 1 0 0 0 0 + 6 7 1 0 0 0 0 + 6 73 1 0 0 0 0 + 6 74 1 0 0 0 0 + 7 8 1 0 0 0 0 + 7 75 1 0 0 0 0 + 7 76 1 0 0 0 0 + 8 9 1 0 0 0 0 + 8 77 1 0 0 0 0 + 8 78 1 0 0 0 0 + 9 10 1 0 0 0 0 + 10 11 2 0 0 0 0 + 10 12 1 0 0 0 0 + 12 13 2 0 0 0 0 + 12 14 1 0 0 0 0 + 14 15 1 0 0 0 0 + 14 19 1 0 0 0 0 + 14 20 1 0 0 0 0 + 15 16 1 0 0 0 0 + 15 21 1 0 0 0 0 + 15 79 1 0 0 0 0 + 16 17 1 0 0 0 0 + 16 80 1 0 0 0 0 + 16 81 1 0 0 0 0 + 17 18 1 0 0 0 0 + 17 82 1 0 0 0 0 + 17 83 1 0 0 0 0 + 18 19 1 0 0 0 0 + 18 22 1 0 0 0 0 + 18 84 1 0 0 0 0 + 20 85 1 0 0 0 0 + 21 86 1 0 0 0 0 + 21 87 1 0 0 0 0 + 21 88 1 0 0 0 0 + 22 23 1 0 0 0 0 + 22 89 1 0 0 0 0 + 22 90 1 0 0 0 0 + 23 24 1 0 0 0 0 + 23 25 1 0 0 0 0 + 23 91 1 0 0 0 0 + 24 67 2 0 0 0 0 + 24 69 1 0 0 0 0 + 25 26 1 0 0 0 0 + 25 27 2 0 0 0 0 + 26 92 1 0 0 0 0 + 26 93 1 0 0 0 0 + 26 94 1 0 0 0 0 + 27 28 1 0 0 0 0 + 27 95 1 0 0 0 0 + 28 29 2 0 0 0 0 + 28 96 1 0 0 0 0 + 29 30 1 0 0 0 0 + 29 97 1 0 0 0 0 + 30 31 2 0 0 0 0 + 30 98 1 0 0 0 0 + 31 32 1 0 0 0 0 + 31 99 1 0 0 0 0 + 32 33 1 0 0 0 0 + 32 34 1 0 0 0 0 + 32100 1 0 0 0 0 + 33101 1 0 0 0 0 + 33102 1 0 0 0 0 + 33103 1 0 0 0 0 + 34 35 1 0 0 0 0 + 34104 1 0 0 0 0 + 34105 1 0 0 0 0 + 35 36 1 0 0 0 0 + 35 37 1 0 0 0 0 + 35106 1 0 0 0 0 + 36107 1 0 0 0 0 + 36108 1 0 0 0 0 + 36109 1 0 0 0 0 + 37 38 2 0 0 0 
0 + 37 39 1 0 0 0 0 + 39 40 1 0 0 0 0 + 39 42 1 0 0 0 0 + 39110 1 0 0 0 0 + 40 41 1 0 0 0 0 + 41111 1 0 0 0 0 + 41112 1 0 0 0 0 + 41113 1 0 0 0 0 + 42 43 1 0 0 0 0 + 42 44 1 0 0 0 0 + 42114 1 0 0 0 0 + 43115 1 0 0 0 0 + 44 45 1 0 0 0 0 + 44 46 2 0 0 0 0 + 45116 1 0 0 0 0 + 45117 1 0 0 0 0 + 45118 1 0 0 0 0 + 46 47 1 0 0 0 0 + 46119 1 0 0 0 0 + 47 48 1 0 0 0 0 + 47 49 1 0 0 0 0 + 47120 1 0 0 0 0 + 48121 1 0 0 0 0 + 48122 1 0 0 0 0 + 48123 1 0 0 0 0 + 49 50 2 0 0 0 0 + 49 51 1 0 0 0 0 + 51 52 1 0 0 0 0 + 51124 1 0 0 0 0 + 51125 1 0 0 0 0 + 52 53 1 0 0 0 0 + 52126 1 0 0 0 0 + 53 54 1 0 0 0 0 + 53 55 1 0 0 0 0 + 53127 1 0 0 0 0 + 54128 1 0 0 0 0 + 54129 1 0 0 0 0 + 54130 1 0 0 0 0 + 55 56 1 0 0 0 0 + 55131 1 0 0 0 0 + 55132 1 0 0 0 0 + 56 57 1 0 0 0 0 + 56 64 1 0 0 0 0 + 56133 1 0 0 0 0 + 57 58 1 0 0 0 0 + 57134 1 0 0 0 0 + 57135 1 0 0 0 0 + 58 59 1 0 0 0 0 + 58 61 1 0 0 0 0 + 58136 1 0 0 0 0 + 59 60 1 0 0 0 0 + 60137 1 0 0 0 0 + 60138 1 0 0 0 0 + 60139 1 0 0 0 0 + 61 62 1 0 0 0 0 + 61 63 1 0 0 0 0 + 61140 1 0 0 0 0 + 62141 1 0 0 0 0 + 63 64 1 0 0 0 0 + 63142 1 0 0 0 0 + 63143 1 0 0 0 0 + 64144 1 0 0 0 0 + 64145 1 0 0 0 0 + 65 66 2 0 0 0 0 + 65 68 1 0 0 0 0 + 65 69 1 0 0 0 0 + 66 67 1 0 0 0 0 + 66146 1 0 0 0 0 + 67147 1 0 0 0 0 + 68148 1 0 0 0 0 + 68149 1 0 0 0 0 + 68150 1 0 0 0 0 +M END +> +Cc1ccc(s1)[C@@H]\2C[C@@H]3CC[C@H]([C@@](O3)(C(=O)C(=O)N4CCCC[C@H]4C(=O)O[C@@H](CC(=O)[C@@H](/C=C(/[C@H]([C@H](C(=O)[C@@H](C[C@@H](/C=C/C=C/C=C2\C)C)C)OC)O)\C)C)[C@H](C)C[C@@H]5CC[C@H]([C@@H](C5)OC)O)O)C + +> +InChI=1S/C55H81NO12S/c1-32-16-12-11-13-17-33(2)42(48-24-20-39(8)69-48)30-41-22-19-38(7)55(64,68-41)52(61)53(62)56-25-15-14-18-43(56)54(63)67-46(35(4)28-40-21-23-44(57)47(29-40)65-9)31-45(58)34(3)27-37(6)50(60)51(66-10)49(59)36(5)26-32/h11-13,16-17,20,24,27,32,34-36,38,40-44,46-47,50-51,57,60,64H,14-15,18-19,21-23,25-26,28-31H2,1-10H3/b13-11+,16-12+,33-17+,37-27+/t32-,34-,35-,36-,38-,40+,41+,42-,43+,44-,46+,47-,50-,51+,55-/m1/s1 + +> +SDSGJAIFUCCAOV-MSLSVLDMSA-N + +> 
+C55H81NO12S + +$$$$ diff --git a/rf2aa/config/inference/protein_complex_sm.yaml b/rf2aa/config/inference/protein_complex_sm.yaml new file mode 100644 index 0000000..65b147c --- /dev/null +++ b/rf2aa/config/inference/protein_complex_sm.yaml @@ -0,0 +1,14 @@ +defaults: + - base +job_name: "3fap" + +protein_inputs: + A: + fasta_file: examples/protein/3fap_A.fasta + B: + fasta_file: examples/protein/3fap_B.fasta + +sm_inputs: + C: + input: examples/small_molecule/ARD_ideal.sdf + input_type: "sdf" \ No newline at end of file diff --git a/rf2aa/data/data_loader.py b/rf2aa/data/data_loader.py index 3ce36b3..fe68bdf 100644 --- a/rf2aa/data/data_loader.py +++ b/rf2aa/data/data_loader.py @@ -28,6 +28,10 @@ class RawInputData: def query_sequence(self): return self.msa[0] + def sequence_string(self): + three_letter_sequence = [ChemData().num2aa[num] for num in self.query_sequence()] + return "".join([ChemData().aa_321[three] for three in three_letter_sequence]) + def is_atom(self): return is_atom(self.query_sequence()) diff --git a/rf2aa/data/data_loader_utils.py b/rf2aa/data/data_loader_utils.py index 2f559d3..bd66579 100644 --- a/rf2aa/data/data_loader_utils.py +++ b/rf2aa/data/data_loader_utils.py @@ -548,7 +548,7 @@ def join_msas_by_taxid(a3mA, a3mB, idx_overlap=None): # pair sequences taxids_shared = a3mA['taxid'][np.isin(a3mA['taxid'],a3mB['taxid'])] i_pairedA, i_pairedB = [], [] - + for taxid in taxids_shared: i_match = np.where(a3mA['taxid']==taxid)[0] i_match_best = torch.argmin(torch.sum(a3mA['msa'][i_match]==a3mA['msa'][0], axis=1)) @@ -744,7 +744,7 @@ def load_minimal_multi_msa(hash_list, taxid_list, Ls, params): return a3m_out, hashes_out, Ls_out -def expand_multi_msa(a3m, hashes_in, hashes_out, Ls_in, Ls_out, params): +def expand_multi_msa(a3m, hashes_in, hashes_out, Ls_in, Ls_out): """Expands a multi-MSA of unique chains into an MSA of a hetero-homo-oligomer in which some chains appear more than once. 
The query sequences (1st sequence of MSA) are concatenated directly along the diff --git a/rf2aa/data/merge_inputs.py b/rf2aa/data/merge_inputs.py index 1d3e048..bf12ff8 100644 --- a/rf2aa/data/merge_inputs.py +++ b/rf2aa/data/merge_inputs.py @@ -1,6 +1,7 @@ import torch +from hashlib import md5 -from rf2aa.data.data_loader_utils import merge_a3m_hetero, merge_a3m_homo, merge_hetero_templates, get_term_feats +from rf2aa.data.data_loader_utils import merge_a3m_hetero, merge_a3m_homo, merge_hetero_templates, get_term_feats, join_msas_by_taxid, expand_multi_msa from rf2aa.data.data_loader import RawInputData from rf2aa.util import center_and_realign_missing, same_chain_from_bond_feats, random_rot_trans, idx_from_Ls @@ -18,7 +19,71 @@ def merge_protein_inputs(protein_inputs, deterministic: bool = False): # handle merging MSAs and such # first determine which sequence are identical, then which one have mergeable MSAs # then cat the templates, other feats - pass + else: + a3m_list = [ + {"msa": input.msa, + "ins": input.ins, + "taxid": input.taxids + } + for input in protein_inputs.values() + ] + hash_list = [md5(input.sequence_string().encode()).hexdigest() for input in protein_inputs.values()] + lengths_list = [input.length() for input in protein_inputs.values()] + + seen = set() + unique_indices = [] + for idx, hash in enumerate(hash_list): + if hash not in seen: + unique_indices.append(idx) + seen.add(hash) + + unique_a3m = [a3m for i, a3m in enumerate(a3m_list) if i in unique_indices ] + unique_hashes = [value for index, value in enumerate(hash_list) if index in unique_indices] + unique_lengths_list = [value for index, value in enumerate(lengths_list) if index in unique_indices] + + if len(unique_a3m) >1: + a3m_out = unique_a3m[0] + for i in range(1, len(unique_a3m)): + a3m_out = join_msas_by_taxid(a3m_out, unique_a3m[i]) # index the deduplicated list: i ranges over unique chains, so a3m_list[i] could re-join a repeated chain and skip a distinct one + a3m_out = expand_multi_msa(a3m_out, unique_hashes, hash_list, unique_lengths_list, lengths_list) + else: + a3m = unique_a3m[0] + msa, ins =
a3m["msa"], a3m["ins"] + a3m_out = merge_a3m_homo(msa, ins, len(hash_list)) + + # merge templates + max_template_dim = max([input.xyz_t.shape[0] for input in protein_inputs.values()]) + xyz_t_list = [input.xyz_t for input in protein_inputs.values()] + mask_t_list = [input.mask_t for input in protein_inputs.values()] + t1d_list = [input.t1d for input in protein_inputs.values()] + ids = ["inference"] * len(t1d_list) + xyz_t, t1d, mask_t, _ = merge_hetero_templates(xyz_t_list, t1d_list, mask_t_list, ids, lengths_list, deterministic=deterministic) + + atom_frames = torch.zeros(0,3,2) + chirals = torch.zeros(0,5) + + + L_total = sum(lengths_list) + bond_feats = torch.zeros((L_total, L_total)).long() + offset = 0 + for bf in [input.bond_feats for input in protein_inputs.values()]: + L = bf.shape[0] + bond_feats[offset:offset+L, offset:offset+L] = bf + offset += L + chain_lengths = list(zip(protein_inputs.keys(), lengths_list)) + + merged_input = RawInputData( + a3m_out["msa"], + a3m_out["ins"], + bond_feats, + xyz_t[:max_template_dim], + mask_t[:max_template_dim], + t1d[:max_template_dim], + chirals, + atom_frames, + taxids=None + ) + return merged_input, chain_lengths def merge_na_inputs(na_inputs): # should just be trivially catting features @@ -101,14 +166,6 @@ def merge_all( deterministic: bool = False, ): - #protein_lengths = [protein_input.length() for protein_input in protein_inputs.values()] - #na_lengths = [na_input.length() for na_input in na_inputs.values()] - #sm_lengths = [sm_input.length() for sm_input in sm_inputs.values()] - #all_lengths = protein_lengths + na_lengths + sm_lengths - - #term_info = get_term_feats(all_lengths) - #term_info[sum(protein_lengths):, :] = 0 - protein_inputs, protein_chain_lengths = merge_protein_inputs(protein_inputs, deterministic=deterministic) na_inputs, na_chain_lengths = merge_na_inputs(na_inputs) diff --git a/rf2aa/data/parsers.py b/rf2aa/data/parsers.py index c7b7820..86f8ac0 100644 --- a/rf2aa/data/parsers.py +++
b/rf2aa/data/parsers.py @@ -414,18 +414,21 @@ def parse_a3m(filename, maxseq=8000, paired=False): else: fstream = open(filename, 'r') - for line in fstream: + for i, line in enumerate(fstream): # skip labels if line[0] == '>': if paired: # paired MSAs only have a TAXID in the fasta header taxIDs.append(line[1:].strip()) else: # unpaired MSAs have all the metadata so use regex to pull out TAXID - match = re.search( r'TaxID=(\d+)', line) - if match: - taxIDs.append(match.group(1)) + if i == 0: + taxIDs.append("query") else: - taxIDs.append("query") # query sequence + match = re.search( r'TaxID=(\d+)', line) + if match: + taxIDs.append(match.group(1)) + else: + taxIDs.append("") # query sequence continue # remove right whitespaces diff --git a/rf2aa/data/preprocessing.py b/rf2aa/data/preprocessing.py index 5433e8b..8624d55 100644 --- a/rf2aa/data/preprocessing.py +++ b/rf2aa/data/preprocessing.py @@ -8,11 +8,12 @@ import subprocess def make_msa( fasta_file, + chain, model_runner ): out_dir_base = Path(model_runner.config.output_path) hash = model_runner.config.job_name - out_dir = out_dir_base / hash + out_dir = out_dir_base / hash / chain out_dir.mkdir(parents=True, exist_ok=True) command = model_runner.config.database_params.command diff --git a/rf2aa/data/protein.py b/rf2aa/data/protein.py index d9e0cdc..a944bc8 100644 --- a/rf2aa/data/protein.py +++ b/rf2aa/data/protein.py @@ -88,6 +88,6 @@ def load_protein(msa_file, hhr_fn, atab_fn, model_runner): taxids=taxIDs, ) -def generate_msa_and_load_protein(fasta_file, model_runner): - msa_file, hhr_file, atab_file = make_msa(fasta_file, model_runner) +def generate_msa_and_load_protein(fasta_file, chain, model_runner): + msa_file, hhr_file, atab_file = make_msa(fasta_file, chain, model_runner) return load_protein(str(msa_file), str(hhr_file), str(atab_file), model_runner) diff --git a/rf2aa/run_inference.py b/rf2aa/run_inference.py index 01f50e8..e25f322 100644 --- a/rf2aa/run_inference.py +++ b/rf2aa/run_inference.py @@ 
-45,6 +45,7 @@ class ModelRunner: chains.append(chain) protein_input = generate_msa_and_load_protein( self.config.protein_inputs[chain]["fasta_file"], + chain, self ) protein_inputs[chain] = protein_input diff --git a/rf2aa/test_pickles/model/legacy_train_na_compl_regression.pt b/rf2aa/test_pickles/model/legacy_train_na_compl_regression.pt deleted file mode 100644 index d4f473b..0000000 Binary files a/rf2aa/test_pickles/model/legacy_train_na_compl_regression.pt and /dev/null differ diff --git a/rf2aa/test_pickles/model/legacy_train_rna_regression.pt b/rf2aa/test_pickles/model/legacy_train_rna_regression.pt deleted file mode 100644 index 876c8b2..0000000 Binary files a/rf2aa/test_pickles/model/legacy_train_rna_regression.pt and /dev/null differ diff --git a/rf2aa/test_pickles/model/legacy_train_sm_compl_covale_regression.pt b/rf2aa/test_pickles/model/legacy_train_sm_compl_covale_regression.pt deleted file mode 100644 index b04fd40..0000000 Binary files a/rf2aa/test_pickles/model/legacy_train_sm_compl_covale_regression.pt and /dev/null differ diff --git a/rf2aa/test_pickles/model/legacy_train_sm_compl_regression.pt b/rf2aa/test_pickles/model/legacy_train_sm_compl_regression.pt deleted file mode 100644 index 960849b..0000000 Binary files a/rf2aa/test_pickles/model/legacy_train_sm_compl_regression.pt and /dev/null differ diff --git a/rf2aa/tests/test_conditions.py b/rf2aa/tests/test_conditions.py deleted file mode 100644 index 36f77e2..0000000 --- a/rf2aa/tests/test_conditions.py +++ /dev/null @@ -1,79 +0,0 @@ -import torch -import pandas as pd -import numpy as np -import itertools -from collections import OrderedDict -from hydra import initialize, compose - -from rf2aa.setup_model import trainer_factory, seed_all -from rf2aa.chemical import ChemicalData as ChemData - -# configurations to test -configs = ["legacy_train"] -datasets = ["compl", "na_compl", "rna", "sm_compl", "sm_compl_covale", "sm_compl_asmb"] - -cfg_overrides = [ - "loader_params.p_msa_mask=0.0", - 
"loader_params.crop=100000", - "loader_params.mintplt=0", - "loader_params.maxtplt=2" -] - -def make_deterministic(seed=0): - seed_all(seed) - if torch.cuda.is_available(): - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - -def setup_dataset_names(): - data = {} - for name in datasets: - data[name] = [name] - return data - -# set up models for regression tests -def setup_models(device="cpu"): - models, chem_cfgs = [], [] - for config in configs: - with initialize(version_base=None, config_path="../config/train"): - cfg = compose(config_name=config, overrides=cfg_overrides) - - # initializing the model needs the chemical DB initialized. Force a reload - ChemData.reset() - ChemData(cfg.chem_params) - - trainer = trainer_factory[cfg.experiment.trainer](cfg) - seed_all() - trainer.construct_model(device=device) - models.append(trainer.model) - chem_cfgs.append(cfg.chem_params) - trainer = None - - return dict(zip(configs, (zip(configs, models, chem_cfgs)))) - -# set up job array for regression -def setup_array(datasets, models, device="cpu"): - test_data = setup_dataset_names() - test_models = setup_models(device=device) - test_data = [test_data[dataset] for dataset in datasets] - test_models = [test_models[model] for model in models] - return (list(itertools.product(test_data, test_models))) - -def random_param_init(model): - seed_all() - with torch.no_grad(): - fake_state_dict = OrderedDict() - for name, param in model.model.named_parameters(): - fake_state_dict[name] = torch.randn_like(param) - model.model.load_state_dict(fake_state_dict) - model.shadow.load_state_dict(fake_state_dict) - return model - -def dataset_pickle_path(dataset_name): - return f"test_pickles/data/{dataset_name}_regression.pt" - -def model_pickle_path(dataset_name, model_name): - return f"test_pickles/model/{model_name}_{dataset_name}_regression.pt" - -def loss_pickle_path(dataset_name, model_name, loss_name): - return 
f"test_pickles/loss/{loss_name}_{model_name}_{dataset_name}_regression.pt" \ No newline at end of file diff --git a/rf2aa/tests/test_model.py b/rf2aa/tests/test_model.py deleted file mode 100644 index ec53c43..0000000 --- a/rf2aa/tests/test_model.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import torch -import pytest -import warnings -warnings.filterwarnings("ignore") - -from rf2aa.data.dataloader_adaptor import prepare_input -from rf2aa.training.recycling import run_model_forward_legacy -from rf2aa.tensor_util import assert_equal -from rf2aa.tests.test_conditions import setup_array,\ - make_deterministic, dataset_pickle_path, model_pickle_path -from rf2aa.util_module import XYZConverter -from rf2aa.chemical import ChemicalData as ChemData - - -# goal is to test all the configs on a broad set of datasets - -gpu = "cuda:0" if torch.cuda.is_available() else "cpu" - -legacy_test_conditions = setup_array(["na_compl", "rna", "sm_compl", "sm_compl_covale"], ["legacy_train"], device=gpu) - -@pytest.mark.parametrize("example,model", legacy_test_conditions) -def test_regression_legacy(example, model): - dataset_name, dataset_inputs, model_name, model = setup_test(example, model) - make_deterministic() - output_i = run_model_forward_legacy(model, dataset_inputs, gpu) - model_pickle = model_pickle_path(dataset_name, model_name) - output_names = ("logits_c6d", "logits_aa", "logits_pae", \ - "logits_pde", "p_bind", "xyz", "alpha", "xyz_allatom", \ - "lddt", "seq", "pair", "state") - - if not os.path.exists(model_pickle): - torch.save(output_i, model_pickle) - else: - output_regression = torch.load(model_pickle, map_location=gpu) - for idx, output in enumerate(output_i): - got = output - want = output_regression[idx] - if output_names[idx] == "logits_c6d": - for i in range(len(want)): - - got_i = got[i] - want_i = want[i] - try: - assert_equal(got_i, want_i) - except Exception as e: - raise ValueError(f"{output_names[idx]} not same for model: {model_name} on dataset: 
{dataset_name}") from e - elif output_names[idx] in ["alpha", "xyz_allatom", "seq", "pair", "state"]: - try: - assert torch.allclose(got, want, atol=1e-4) - except Exception as e: - raise ValueError(f"{output_names[idx]} not same for model: {model_name} on dataset: {dataset_name}") from e - else: - try: - assert_equal(got, want) - except Exception as e: - raise ValueError(f"{output_names[idx]} not same for model: {model_name} on dataset: {dataset_name}") from e - -def setup_test(example, model): - model_name, model, config = model - - # initialize chemical database - ChemData.reset() # force reload chemical data - ChemData(config) - - model = model.to(gpu) - dataset_name = example[0] - dataloader_inputs = torch.load(dataset_pickle_path(dataset_name), map_location=gpu) - xyz_converter = XYZConverter().to(gpu) - task, item, network_input, true_crds, mask_crds, msa, mask_msa, unclamp, \ - negative, symmRs, Lasu, ch_label = prepare_input(dataloader_inputs,xyz_converter, gpu) - return dataset_name, network_input, model_name, model -