import torch import warnings import time from icecream import ic from torch.utils import data import os, csv, random, pickle, gzip, itertools, time, ast, copy, sys script_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.append(script_dir) sys.path.append(script_dir+'/../') import numpy as np import scipy import networkx as nx from rf2aa.data.parsers import parse_a3m, parse_pdb from rf2aa.chemical import ChemicalData as ChemData from rf2aa.util import random_rot_trans, \ is_atom, is_protein, is_nucleic, is_atom def MSABlockDeletion(msa, ins, nb=5): ''' Input: MSA having shape (N, L) output: new MSA with block deletion ''' N, L = msa.shape block_size = max(int(N*0.3), 1) block_start = np.random.randint(low=1, high=N, size=nb) # (nb) to_delete = block_start[:,None] + np.arange(block_size)[None,:] to_delete = np.unique(np.clip(to_delete, 1, N-1)) # mask = np.ones(N, bool) mask[to_delete] = 0 return msa[mask], ins[mask] def cluster_sum(data, assignment, N_seq, N_res): csum = torch.zeros(N_seq, N_res, data.shape[-1], device=data.device).scatter_add(0, assignment.view(-1,1,1).expand(-1,N_res,data.shape[-1]), data.float()) return csum def get_term_feats(Ls): """Creates N/C-terminus binary features""" term_info = torch.zeros((sum(Ls),2)).float() start = 0 for L_chain in Ls: term_info[start, 0] = 1.0 # flag for N-term term_info[start+L_chain-1,1] = 1.0 # flag for C-term start += L_chain return term_info def MSAFeaturize(msa, ins, params, p_mask=0.15, eps=1e-6, nmer=1, L_s=[], term_info=None, tocpu=False, fixbb=False, seed_msa_clus=None, deterministic=False): ''' Input: full MSA information (after Block deletion if necessary) & full insertion information Output: seed MSA features & extra sequences Seed MSA features: - aatype of seed sequence (20 regular aa + 1 gap/unknown + 1 mask) - profile of clustered sequences (22) - insertion statistics (2) - N-term or C-term? (2) extra sequence features: - aatype of extra sequence (22) - insertion info (1) - N-term or C-term? (2) ''' if deterministic: random.seed(0) np.random.seed(0) torch.manual_seed(0) torch.cuda.manual_seed(0) # TODO: delete me, just for testing purposes msa = msa[:2] if fixbb: p_mask = 0 msa = msa[:1] ins = ins[:1] N, L = msa.shape if term_info is None: if len(L_s)==0: L_s = [L] term_info = get_term_feats(L_s) term_info = term_info.to(msa.device) #binding_site = torch.zeros((L,1), device=msa.device).float() binding_site = torch.zeros((L,0), device=msa.device).float() # keeping this off for now (Jue 12/19) # raw MSA profile raw_profile = torch.nn.functional.one_hot(msa, num_classes=ChemData().NAATOKENS) # N x L x NAATOKENS raw_profile = raw_profile.float().mean(dim=0) # L x NAATOKENS # Select Nclust sequence randomly (seed MSA or latent MSA) Nclust = (min(N, params['MAXLAT'])-1) // nmer Nclust = Nclust*nmer + 1 if N > Nclust*2: Nextra = N - Nclust else: Nextra = N Nextra = min(Nextra, params['MAXSEQ']) // nmer Nextra = max(1, Nextra * nmer) # b_seq = list() b_msa_clust = list() b_msa_seed = list() b_msa_extra = list() b_mask_pos = list() for i_cycle in range(params['MAXCYCLE']): sample_mono = torch.randperm((N-1)//nmer, device=msa.device) sample = [sample_mono + imer*((N-1)//nmer) for imer in range(nmer)] sample = torch.stack(sample, dim=-1) sample = sample.reshape(-1) # add MSA clusters pre-chosen before calling this function if seed_msa_clus is not None: sample_orig_shape = sample.shape sample_seed = seed_msa_clus[i_cycle] sample_more = torch.tensor([i for i in sample if i not in sample_seed]) N_sample_more = len(sample) - len(sample_seed) if N_sample_more > 0: sample_more = sample_more[torch.randperm(len(sample_more))[:N_sample_more]] sample = torch.cat([sample_seed, sample_more]) else: sample = sample_seed[:len(sample)] # take all clusters from pre-chosen ones msa_clust = torch.cat((msa[:1,:], msa[1:,:][sample[:Nclust-1]]), dim=0) ins_clust = torch.cat((ins[:1,:], ins[1:,:][sample[:Nclust-1]]), dim=0) # 15% random masking # - 10%: aa replaced with a uniformly sampled random amino acid # - 10%: aa replaced with an amino acid sampled from the MSA profile # - 10%: not replaced # - 70%: replaced with a special token ("mask") random_aa = torch.tensor([[0.05]*20 + [0.0]*(ChemData().NAATOKENS-20)], device=msa.device) same_aa = torch.nn.functional.one_hot(msa_clust, num_classes=ChemData().NAATOKENS) # explicitly remove probabilities from nucleic acids and atoms #same_aa[..., ChemData().NPROTAAS:] = 0 #raw_profile[...,ChemData().NPROTAAS:] = 0 probs = 0.1*random_aa + 0.1*raw_profile + 0.1*same_aa #probs = torch.nn.functional.pad(probs, (0, 1), "constant", 0.7) # explicitly set the probability of masking for nucleic acids and atoms #probs[...,is_protein(seq),ChemData().MASKINDEX]=0.7 #probs[...,~is_protein(seq), :] = 0 # probably overkill but set all none protein elements to 0 #probs[1:, ~is_protein(seq),20] = 1.0 # want to leave the gaps as gaps #probs[0,is_nucleic(seq), ChemData().MASKINDEX] = 1.0 #probs[0,is_atom(seq), ChemData().aa2num["ATM"]] = 1.0 sampler = torch.distributions.categorical.Categorical(probs=probs) mask_sample = sampler.sample() mask_pos = torch.rand(msa_clust.shape, device=msa_clust.device) < p_mask mask_pos[msa_clust>ChemData().MASKINDEX]=False # no masking on NAs use_seq = msa_clust msa_masked = torch.where(mask_pos, mask_sample, use_seq) b_seq.append(msa_masked[0].clone()) ## get extra sequenes if N > Nclust*2: # there are enough extra sequences msa_extra = msa[1:,:][sample[Nclust-1:]] ins_extra = ins[1:,:][sample[Nclust-1:]] extra_mask = torch.full(msa_extra.shape, False, device=msa_extra.device) elif N - Nclust < 1: msa_extra = msa_masked.clone() ins_extra = ins_clust.clone() extra_mask = mask_pos.clone() else: msa_add = msa[1:,:][sample[Nclust-1:]] ins_add = ins[1:,:][sample[Nclust-1:]] mask_add = torch.full(msa_add.shape, False, device=msa_add.device) msa_extra = torch.cat((msa_masked, msa_add), dim=0) ins_extra = torch.cat((ins_clust, ins_add), dim=0) extra_mask = torch.cat((mask_pos, mask_add), dim=0) N_extra = msa_extra.shape[0] # clustering (assign remaining sequences to their closest cluster by Hamming distance msa_clust_onehot = torch.nn.functional.one_hot(msa_masked, num_classes=ChemData().NAATOKENS) msa_extra_onehot = torch.nn.functional.one_hot(msa_extra, num_classes=ChemData().NAATOKENS) count_clust = torch.logical_and(~mask_pos, msa_clust != 20).float() # 20: index for gap, ignore both masked & gaps count_extra = torch.logical_and(~extra_mask, msa_extra != 20).float() agreement = torch.matmul((count_extra[:,:,None]*msa_extra_onehot).view(N_extra, -1), (count_clust[:,:,None]*msa_clust_onehot).view(Nclust, -1).T) assignment = torch.argmax(agreement, dim=-1) # seed MSA features # 1. one_hot encoded aatype: msa_clust_onehot # 2. cluster profile count_extra = ~extra_mask count_clust = ~mask_pos msa_clust_profile = cluster_sum(count_extra[:,:,None]*msa_extra_onehot, assignment, Nclust, L) msa_clust_profile += count_clust[:,:,None]*msa_clust_profile count_profile = cluster_sum(count_extra[:,:,None], assignment, Nclust, L).view(Nclust, L) count_profile += count_clust count_profile += eps msa_clust_profile /= count_profile[:,:,None] # 3. insertion statistics msa_clust_del = cluster_sum((count_extra*ins_extra)[:,:,None], assignment, Nclust, L).view(Nclust, L) msa_clust_del += count_clust*ins_clust msa_clust_del /= count_profile ins_clust = (2.0/np.pi)*torch.arctan(ins_clust.float()/3.0) # (from 0 to 1) msa_clust_del = (2.0/np.pi)*torch.arctan(msa_clust_del.float()/3.0) # (from 0 to 1) ins_clust = torch.stack((ins_clust, msa_clust_del), dim=-1) # if fixbb: assert params['MAXCYCLE'] == 1 msa_clust_profile = msa_clust_onehot msa_extra_onehot = msa_clust_onehot ins_clust[:] = 0 ins_extra[:] = 0 # This is how it is done in rfdiff, but really it seems like it should be all 0. # Keeping as-is for now for consistency, as it may be used in downstream masking done # by apply_masks. mask_pos = torch.full_like(msa_clust, 1).bool() msa_seed = torch.cat((msa_clust_onehot, msa_clust_profile, ins_clust, term_info[None].expand(Nclust,-1,-1)), dim=-1) # extra MSA features ins_extra = (2.0/np.pi)*torch.arctan(ins_extra[:Nextra].float()/3.0) # (from 0 to 1) try: msa_extra = torch.cat((msa_extra_onehot[:Nextra], ins_extra[:,:,None], term_info[None].expand(Nextra,-1,-1)), dim=-1) except Exception as e: print('msa_extra.shape',msa_extra.shape) print('ins_extra.shape',ins_extra.shape) if (tocpu): b_msa_clust.append(msa_clust.cpu()) b_msa_seed.append(msa_seed.cpu()) b_msa_extra.append(msa_extra.cpu()) b_mask_pos.append(mask_pos.cpu()) else: b_msa_clust.append(msa_clust) b_msa_seed.append(msa_seed) b_msa_extra.append(msa_extra) b_mask_pos.append(mask_pos) b_seq = torch.stack(b_seq) b_msa_clust = torch.stack(b_msa_clust) b_msa_seed = torch.stack(b_msa_seed) b_msa_extra = torch.stack(b_msa_extra) b_mask_pos = torch.stack(b_mask_pos) return b_seq, b_msa_clust, b_msa_seed, b_msa_extra, b_mask_pos def blank_template(n_tmpl, L, random_noise=5.0, deterministic: bool = False): if deterministic: random.seed(0) np.random.seed(0) torch.manual_seed(0) torch.cuda.manual_seed(0) xyz = ChemData().INIT_CRDS.reshape(1,1,ChemData().NTOTAL,3).repeat(n_tmpl,L,1,1) \ + torch.rand(n_tmpl,L,1,3)*random_noise - random_noise/2 t1d = torch.nn.functional.one_hot(torch.full((n_tmpl, L), 20).long(), num_classes=ChemData().NAATOKENS-1).float() # all gaps conf = torch.zeros((n_tmpl, L, 1)).float() t1d = torch.cat((t1d, conf), -1) mask_t = torch.full((n_tmpl,L,ChemData().NTOTAL), False) return xyz, t1d, mask_t, np.full((n_tmpl), "") def TemplFeaturize(tplt, qlen, params, offset=0, npick=1, npick_global=None, pick_top=True, same_chain=None, random_noise=5, deterministic: bool = False): if deterministic: random.seed(0) np.random.seed(0) torch.manual_seed(0) torch.cuda.manual_seed(0) seqID_cut = params['SEQID'] if npick_global == None: npick_global=max(npick, 1) ntplt = len(tplt['ids']) if (ntplt < 1) or (npick < 1): #no templates in hhsearch file or not want to use templ return blank_template(npick_global, qlen, random_noise) # ignore templates having too high seqID if seqID_cut <= 100.0: tplt_valid_idx = torch.where(tplt['f0d'][0,:,4] < seqID_cut)[0] tplt['ids'] = np.array(tplt['ids'])[tplt_valid_idx] else: tplt_valid_idx = torch.arange(len(tplt['ids'])) # check again if there are templates having seqID < cutoff ntplt = len(tplt['ids']) npick = min(npick, ntplt) if npick<1: # no templates return blank_template(npick_global, qlen, random_noise) if not pick_top: # select randomly among all possible templates sample = torch.randperm(ntplt)[:npick] else: # only consider top 50 templates sample = torch.randperm(min(50,ntplt))[:npick] xyz = ChemData().INIT_CRDS.reshape(1,1,ChemData().NTOTAL,3).repeat(npick_global,qlen,1,1) + torch.rand(1,qlen,1,3)*random_noise mask_t = torch.full((npick_global,qlen,ChemData().NTOTAL),False) # True for valid atom, False for missing atom t1d = torch.full((npick_global, qlen), 20).long() t1d_val = torch.zeros((npick_global, qlen)).float() for i,nt in enumerate(sample): tplt_idx = tplt_valid_idx[nt] sel = torch.where(tplt['qmap'][0,:,1]==tplt_idx)[0] pos = tplt['qmap'][0,sel,0] + offset ntmplatoms = tplt['xyz'].shape[2] # will be bigger for NA templates xyz[i,pos,:ntmplatoms] = tplt['xyz'][0,sel] mask_t[i,pos,:ntmplatoms] = tplt['mask'][0,sel].bool() # 1-D features: alignment confidence t1d[i,pos] = tplt['seq'][0,sel] t1d_val[i,pos] = tplt['f1d'][0,sel,2] # alignment confidence # xyz[i] = center_and_realign_missing(xyz[i], mask_t[i], same_chain=same_chain) t1d = torch.nn.functional.one_hot(t1d, num_classes=ChemData().NAATOKENS-1).float() # (no mask token) t1d = torch.cat((t1d, t1d_val[...,None]), dim=-1) tplt_ids = np.array(tplt["ids"])[sample].flatten() # np.array of chain ids (ordered) return xyz, t1d, mask_t, tplt_ids def merge_hetero_templates(xyz_t_prot, f1d_t_prot, mask_t_prot, tplt_ids, Ls_prot, deterministic: bool = False): """Diagonally tiles template coordinates, 1d input features, and masks across template and residue dimensions. 1st template is concatenated directly on residue dimension after a random rotation & translation. """ N_tmpl_tot = sum([x.shape[0] for x in xyz_t_prot]) xyz_t_out, f1d_t_out, mask_t_out, _ = blank_template(N_tmpl_tot, sum(Ls_prot)) tplt_ids_out = np.full((N_tmpl_tot),"", dtype=object) # rk bad practice.. should fix i_tmpl = 0 i_res = 0 for xyz_, f1d_, mask_, ids in zip(xyz_t_prot, f1d_t_prot, mask_t_prot, tplt_ids): N_tmpl, L_tmpl = xyz_.shape[:2] if i_tmpl == 0: i1, i2 = 1, N_tmpl else: i1, i2 = i_tmpl, i_tmpl+N_tmpl - 1 # 1st template is concatenated directly, so that all atoms are set in xyz_prev xyz_t_out[0, i_res:i_res+L_tmpl] = random_rot_trans(xyz_[0:1], deterministic=deterministic) f1d_t_out[0, i_res:i_res+L_tmpl] = f1d_[0] mask_t_out[0, i_res:i_res+L_tmpl] = mask_[0] if not tplt_ids_out[0]: # only add first template tplt_ids_out[0] = ids[0] # remaining templates are diagonally tiled xyz_t_out[i1:i2, i_res:i_res+L_tmpl] = xyz_[1:] f1d_t_out[i1:i2, i_res:i_res+L_tmpl] = f1d_[1:] mask_t_out[i1:i2, i_res:i_res+L_tmpl] = mask_[1:] tplt_ids_out[i1:i2] = ids[1:] if i_tmpl == 0: i_tmpl += N_tmpl else: i_tmpl += N_tmpl-1 i_res += L_tmpl return xyz_t_out, f1d_t_out, mask_t_out, tplt_ids_out def generate_xyz_prev(xyz_t, mask_t, params): """ allows you to use different initializations for the coordinate track specified in params """ L = xyz_t.shape[1] if params["BLACK_HOLE_INIT"]: xyz_t, _, mask_t = blank_template(1, L) return xyz_t[0].clone(), mask_t[0].clone() ### merge msa & insertion statistics of two proteins having different taxID def merge_a3m_hetero(a3mA, a3mB, L_s): # merge msa query = torch.cat([a3mA['msa'][0], a3mB['msa'][0]]).unsqueeze(0) # (1, L) msa = [query] if a3mA['msa'].shape[0] > 1: extra_A = torch.nn.functional.pad(a3mA['msa'][1:], (0,sum(L_s[1:])), "constant", 20) # pad gaps msa.append(extra_A) if a3mB['msa'].shape[0] > 1: extra_B = torch.nn.functional.pad(a3mB['msa'][1:], (L_s[0],0), "constant", 20) msa.append(extra_B) msa = torch.cat(msa, dim=0) # merge ins query = torch.cat([a3mA['ins'][0], a3mB['ins'][0]]).unsqueeze(0) # (1, L) ins = [query] if a3mA['ins'].shape[0] > 1: extra_A = torch.nn.functional.pad(a3mA['ins'][1:], (0,sum(L_s[1:])), "constant", 0) # pad gaps ins.append(extra_A) if a3mB['ins'].shape[0] > 1: extra_B = torch.nn.functional.pad(a3mB['ins'][1:], (L_s[0],0), "constant", 0) ins.append(extra_B) ins = torch.cat(ins, dim=0) a3m = {'msa': msa, 'ins': ins} # merge taxids if 'taxid' in a3mA and 'taxid' in a3mB: a3m['taxid'] = np.concatenate([np.array(a3mA['taxid']), np.array(a3mB['taxid'])[1:]]) return a3m # merge msa & insertion statistics of units in homo-oligomers def merge_a3m_homo(msa_orig, ins_orig, nmer, mode="default"): N, L = msa_orig.shape[:2] if mode == "repeat": # AAAAAA # AAAAAA msa = torch.tile(msa_orig,(1,nmer)) ins = torch.tile(ins_orig,(1,nmer)) elif mode == "diag": # AAAAAA # A----- # -A---- # --A--- # ---A-- # ----A- # -----A N = N - 1 new_N = 1 + N * nmer new_L = L * nmer msa = torch.full((new_N, new_L), 20, dtype=msa_orig.dtype, device=msa_orig.device) ins = torch.full((new_N, new_L), 0, dtype=ins_orig.dtype, device=msa_orig.device) start_L = 0 start_N = 1 for i_c in range(nmer): msa[0, start_L:start_L+L] = msa_orig[0] msa[start_N:start_N+N, start_L:start_L+L] = msa_orig[1:] ins[0, start_L:start_L+L] = ins_orig[0] ins[start_N:start_N+N, start_L:start_L+L] = ins_orig[1:] start_L += L start_N += N else: # AAAAAA # A----- # -AAAAA msa = torch.full((2*N-1, L*nmer), 20, dtype=msa_orig.dtype, device=msa_orig.device) ins = torch.full((2*N-1, L*nmer), 0, dtype=ins_orig.dtype, device=msa_orig.device) msa[:N, :L] = msa_orig ins[:N, :L] = ins_orig start = L for i_c in range(1,nmer): msa[0, start:start+L] = msa_orig[0] msa[N:, start:start+L] = msa_orig[1:] ins[0, start:start+L] = ins_orig[0] ins[N:, start:start+L] = ins_orig[1:] start += L return msa, ins def merge_msas(a3m_list, L_s): """ takes a list of a3m dictionaries with keys msa, ins and a list of protein lengths and creates a combined MSA """ seen = set() taxIDs = [] a3mA = a3m_list[0] taxIDs.extend(a3mA["taxID"]) seen.update(a3mA["hash"]) msaA, insA = a3mA["msa"], a3mA["ins"] for i in range(1, len(a3m_list)): a3mB = a3m_list[i] pair_taxIDs = set(taxIDs).intersection(set(a3mB["taxID"])) if a3mB["hash"] in seen or len(pair_taxIDs) < 5: #homomer/not enough pairs a3mA = {"msa": msaA, "ins": insA} L_s_to_merge = [sum(L_s[:i]), L_s[i]] a3mA = merge_a3m_hetero(a3mA, a3mB, L_s_to_merge) msaA, insA = a3mA["msa"], a3mA["ins"] taxIDs.extend(a3mB["taxID"]) else: final_pairsA = [] final_pairsB = [] msaB, insB = a3mB["msa"], a3mB["ins"] for pair in pair_taxIDs: pair_a3mA = np.where(np.array(taxIDs)==pair)[0] pair_a3mB = np.where(a3mB["taxID"]==pair)[0] msaApair = torch.argmin(torch.sum(msaA[pair_a3mA, :] == msaA[0, :],axis=-1)) msaBpair = torch.argmin(torch.sum(msaB[pair_a3mB, :] == msaB[0, :],axis=-1)) final_pairsA.append(pair_a3mA[msaApair]) final_pairsB.append(pair_a3mB[msaBpair]) paired_msaB = torch.full((msaA.shape[0], L_s[i]), 20).long() # (N_seq_A, L_B) paired_msaB[final_pairsA] = msaB[final_pairsB] msaA = torch.cat([msaA, paired_msaB], dim=1) insA = torch.zeros_like(msaA) # paired MSAs in our dataset dont have insertions seen.update(a3mB["hash"]) return msaA, insA def remove_all_gap_seqs(a3m): """Removes sequences that are all gaps from an MSA represented as `a3m` dictionary""" idx_seq_keep = ~(a3m['msa']==ChemData().UNKINDEX).all(dim=1) a3m['msa'] = a3m['msa'][idx_seq_keep] a3m['ins'] = a3m['ins'][idx_seq_keep] return a3m def join_msas_by_taxid(a3mA, a3mB, idx_overlap=None): """Joins (or "pairs") 2 MSAs by matching sequences with the same taxonomic ID. If more than 1 sequence exists in both MSAs with the same tax ID, only the sequence with the highest sequence identity to the query (1st sequence in MSA) will be paired. Sequences that aren't paired will be padded and added to the bottom of the joined MSA. If a subregion of the input MSAs overlap (represent the same chain), the subregion residue indices can be given as `idx_overlap`, and the overlap region of the unpaired sequences will be included in the joined MSA. Parameters ---------- a3mA : dict First MSA to be joined, with keys `msa` (N_seq, L_seq), `ins` (N_seq, L_seq), `taxid` (N_seq,), and optionally `is_paired` (N_seq,), a boolean tensor indicating whether each sequence is fully paired. Can be a multi-MSA (contain >2 sub-MSAs). a3mB : dict 2nd MSA to be joined, with keys `msa`, `ins`, `taxid`, and optionally `is_paired`. Can be a multi-MSA ONLY if not overlapping with 1st MSA. idx_overlap : tuple or list (optional) Start and end indices of overlap region in 1st MSA, followed by the same in 2nd MSA. Returns ------- a3m : dict Paired MSA, with keys `msa`, `ins`, `taxid` and `is_paired`. """ # preprocess overlap region L_A, L_B = a3mA['msa'].shape[1], a3mB['msa'].shape[1] if idx_overlap is not None: i1A, i2A, i1B, i2B = idx_overlap i1B_new, i2B_new = (0, i1B) if i2B==L_B else (i2B, L_B) # MSA B residues that don't overlap MSA A assert((i1B==0) or (i2B==a3mB['msa'].shape[1])), \ "When overlapping with 1st MSA, 2nd MSA must comprise at most 2 sub-MSAs "\ "(i.e. residue range should include 0 or a3mB['msa'].shape[1])" else: i1B_new, i2B_new = (0, L_B) # pair sequences taxids_shared = a3mA['taxid'][np.isin(a3mA['taxid'],a3mB['taxid'])] i_pairedA, i_pairedB = [], [] for taxid in taxids_shared: i_match = np.where(a3mA['taxid']==taxid)[0] i_match_best = torch.argmin(torch.sum(a3mA['msa'][i_match]==a3mA['msa'][0], axis=1)) i_pairedA.append(i_match[i_match_best]) i_match = np.where(a3mB['taxid']==taxid)[0] i_match_best = torch.argmin(torch.sum(a3mB['msa'][i_match]==a3mB['msa'][0], axis=1)) i_pairedB.append(i_match[i_match_best]) # unpaired sequences i_unpairedA = np.setdiff1d(np.arange(a3mA['msa'].shape[0]), i_pairedA) i_unpairedB = np.setdiff1d(np.arange(a3mB['msa'].shape[0]), i_pairedB) N_paired, N_unpairedA, N_unpairedB = len(i_pairedA), len(i_unpairedA), len(i_unpairedB) # handle overlap region # if msa A consists of sub-MSAs 1,2,3 and msa B of 2,4 (i.e overlap region is 2), # this diagram shows how the variables below make up the final multi-MSA # (* denotes nongaps, - denotes gaps) # 1 2 3 4 # |*|*|*|*| msa_paired # |*|*|*|-| msaA_unpaired # |-|*|-|*| msaB_unpaired if idx_overlap is not None: assert((a3mA['msa'][i_pairedA, i1A:i2A]==a3mB['msa'][i_pairedB, i1B:i2B]) | (a3mA['msa'][i_pairedA, i1A:i2A]==ChemData().UNKINDEX)).all(),\ 'Paired MSAs should be identical (or 1st MSA should be all gaps) in overlap region' # overlap region gets sequences from 2nd MSA bc sometimes 1st MSA will be all gaps here msa_paired = torch.cat([a3mA['msa'][i_pairedA, :i1A], a3mB['msa'][i_pairedB, i1B:i2B], a3mA['msa'][i_pairedA, i2A:], a3mB['msa'][i_pairedB, i1B_new:i2B_new] ], dim=1) msaA_unpaired = torch.cat([a3mA['msa'][i_unpairedA], torch.full((N_unpairedA, i2B_new-i1B_new), ChemData().UNKINDEX) ], dim=1) msaB_unpaired = torch.cat([torch.full((N_unpairedB, i1A), ChemData().UNKINDEX), a3mB['msa'][i_unpairedB, i1B:i2B], torch.full((N_unpairedB, L_A-i2A), ChemData().UNKINDEX), a3mB['msa'][i_unpairedB, i1B_new:i2B_new] ], dim=1) else: # no overlap region, simple offset pad & stack # this code is actually a special case of "if" block above, but writing # this out explicitly here to make the logic more clear msa_paired = torch.cat([a3mA['msa'][i_pairedA], a3mB['msa'][i_pairedB, i1B_new:i2B_new]], dim=1) msaA_unpaired = torch.cat([a3mA['msa'][i_unpairedA], torch.full((N_unpairedA, L_B), ChemData().UNKINDEX)], dim=1) # pad with gaps msaB_unpaired = torch.cat([torch.full((N_unpairedB, L_A), ChemData().UNKINDEX), a3mB['msa'][i_unpairedB]], dim=1) # pad with gaps # stack paired & unpaired msa = torch.cat([msa_paired, msaA_unpaired, msaB_unpaired], dim=0) taxids = np.concatenate([a3mA['taxid'][i_pairedA], a3mA['taxid'][i_unpairedA], a3mB['taxid'][i_unpairedB]]) # label "fully paired" sequences (a row of MSA that was never padded with gaps) # output seq is fully paired if seqs A & B both started out as paired and were paired to # each other on tax ID. # NOTE: there is a rare edge case that is ignored here for simplicity: if # pMSA 0+1 and 1+2 are joined and then joined to 2+3, a seq that exists in # 0+1 and 2+3 but NOT 1+2 will become fully paired on the last join but # will not be labeled as such here is_pairedA = a3mA['is_paired'] if 'is_paired' in a3mA else torch.ones((a3mA['msa'].shape[0],)).bool() is_pairedB = a3mB['is_paired'] if 'is_paired' in a3mB else torch.ones((a3mB['msa'].shape[0],)).bool() is_paired = torch.cat([is_pairedA[i_pairedA] & is_pairedB[i_pairedB], torch.zeros((N_unpairedA + N_unpairedB,)).bool()]) # insertion features in paired MSAs are assumed to be zero a3m = dict(msa=msa, ins=torch.zeros_like(msa), taxid=taxids, is_paired=is_paired) return a3m def load_minimal_multi_msa(hash_list, taxid_list, Ls, params): """Load a multi-MSA, which is a MSA that is paired across more than 2 chains. This loads the MSA for unique chains. Use 'expand_multi_msa` to duplicate portions of the MSA for homo-oligomer repeated chains. Given a list of unique MSA hashes, loads all MSAs (using paired MSAs where it can) and pairs sequences across as many sub-MSAs as possible by matching taxonomic ID. For details on how pairing is done, see `join_msas_by_taxid()` Parameters ---------- hash_list : list of str Hashes of MSAs to load and join. Must not contain duplicates. taxid_list : list of str Taxonomic IDs of query sequences of each input MSA. Ls : list of int Lengths of the chains corresponding to the hashes. Returns ------- a3m_out : dict Multi-MSA with all input MSAs. Keys: `msa`,`ins` [torch.Tensor (N_seq, L)], `taxid` [np.array (Nseq,)], `is_paired` [torch.Tensor (N_seq,)] hashes_out : list of str Hashes of MSAs in the order that they are joined in `a3m_out`. Contains the same elements as the input `hash_list` but may be in a different order. Ls_out : list of int Lengths of each chain in `a3m_out` """ assert(len(hash_list)==len(set(hash_list))), 'Input MSA hashes must be unique' # the lists below are constructed such that `a3m_list[i_a3m]` is a multi-MSA # comprising sub-MSAs whose indices in the input lists are # `i_in = idx_list_groups[i_a3m][i_submsa]`, i.e. the sub-MSA hashes are # `hash_list[i_in]` and lengths are `Ls[i_in]`. # Each sub-MSA spans a region of its multi-MSA `a3m_list[i_a3m][:,i_start:i_end]`, # where `(i_start,i_end) = res_range_groups[i_a3m][i_submsa]` a3m_list = [] # list of multi-MSAs idx_list_groups = [] # list of lists of indices of input chains making up each multi-MSA res_range_groups = [] # list of lists of start and end residues of each sub-MSA in multi-MSA # iterate through all pairs of hashes and look for paired MSAs (pMSAs) # NOTE: in the below, if pMSAs are loaded for hashes 0+1 and then 2+3, and # later a pMSA is found for 0+2, the last MSA will not be loaded. The 0+1 # and 2+3 pMSAs will still be joined on taxID at the end, but sequences # only present in the 0+2 pMSA pMSAs will be missed. this is probably very # rare and so is ignored here for simplicity. N = len(hash_list) for i1, i2 in itertools.permutations(range(N),2): idx_list = [x for group in idx_list_groups for x in group] # flattened list of loaded hashes if i1 in idx_list and i2 in idx_list: continue # already loaded if i1 == '' or i2 == '': continue # no taxID means no pMSA # a paired MSA exists if taxid_list[i1]==taxid_list[i2]: h1, h2 = hash_list[i1], hash_list[i2] fn = params['COMPL_DIR']+'/pMSA/'+h1[:3]+'/'+h2[:3]+'/'+h1+'_'+h2+'.a3m.gz' if os.path.exists(fn): msa, ins, taxid = parse_a3m(fn, paired=True) a3m_new = dict(msa=torch.tensor(msa), ins=torch.tensor(ins), taxid=taxid, is_paired=torch.ones(msa.shape[0]).bool()) res_range1 = (0,Ls[i1]) res_range2 = (Ls[i1],msa.shape[1]) # both hashes are new, add paired MSA to list if i1 not in idx_list and i2 not in idx_list: a3m_list.append(a3m_new) idx_list_groups.append([i1,i2]) res_range_groups.append([res_range1, res_range2]) # one of the hashes is already in a multi-MSA # find that multi-MSA and join the new pMSA to it elif i1 in idx_list: # which multi-MSA & sub-MSA has the hash with index `i1`? i_a3m = np.where([i1 in group for group in idx_list_groups])[0][0] i_submsa = np.where(np.array(idx_list_groups[i_a3m])==i1)[0][0] idx_overlap = res_range_groups[i_a3m][i_submsa] + res_range1 a3m_list[i_a3m] = join_msas_by_taxid(a3m_list[i_a3m], a3m_new, idx_overlap) idx_list_groups[i_a3m].append(i2) L = res_range_groups[i_a3m][-1][1] # length of current multi-MSA L_new = res_range2[1] - res_range2[0] res_range_groups[i_a3m].append((L, L+L_new)) elif i2 in idx_list: # which multi-MSA & sub-MSA has the hash with index `i2`? i_a3m = np.where([i2 in group for group in idx_list_groups])[0][0] i_submsa = np.where(np.array(idx_list_groups[i_a3m])==i2)[0][0] idx_overlap = res_range_groups[i_a3m][i_submsa] + res_range2 a3m_list[i_a3m] = join_msas_by_taxid(a3m_list[i_a3m], a3m_new, idx_overlap) idx_list_groups[i_a3m].append(i1) L = res_range_groups[i_a3m][-1][1] # length of current multi-MSA L_new = res_range1[1] - res_range1[0] res_range_groups[i_a3m].append((L, L+L_new)) # add unpaired MSAs # ungroup hash indices now, since we're done making multi-MSAs idx_list = [x for group in idx_list_groups for x in group] for i in range(N): if i not in idx_list: fn = params['PDB_DIR'] + '/a3m/' + hash_list[i][:3] + '/' + hash_list[i] + '.a3m.gz' msa, ins, taxid = parse_a3m(fn) a3m_new = dict(msa=torch.tensor(msa), ins=torch.tensor(ins), taxid=taxid, is_paired=torch.ones(msa.shape[0]).bool()) a3m_list.append(a3m_new) idx_list.append(i) Ls_out = [Ls[i] for i in idx_list] hashes_out = [hash_list[i] for i in idx_list] # join multi-MSAs & unpaired MSAs a3m_out = a3m_list[0] for i in range(1, len(a3m_list)): a3m_out = join_msas_by_taxid(a3m_out, a3m_list[i]) return a3m_out, hashes_out, Ls_out def expand_multi_msa(a3m, hashes_in, hashes_out, Ls_in, Ls_out): """Expands a multi-MSA of unique chains into an MSA of a hetero-homo-oligomer in which some chains appear more than once. The query sequences (1st sequence of MSA) are concatenated directly along the residue dimention. The remaining sequences are offset-tiled (i.e. "padded & stacked") so that exact repeat sequences aren't paired. For example, if the original multi-MSA contains unique chains 1,2,3 but the final chain order is 1,2,1,3,3,1, this function will output an MSA like (where - denotes a block of gap characters): 1 2 - 3 - - - - 1 - 3 - - - - - - 1 Parameters ---------- a3m : dict Contains torch.Tensors `msa` and `ins` (N_seq, L) and np.array `taxid` (Nseq,), representing the multi-MSA of unique chains. hashes_in : list of str Unique MSA hashes used in `a3m`. hashes_out : list of str Non-unique MSA hashes desired in expanded MSA. Ls_in : list of int Lengths of each chain in `a3m` Ls_out : list of int Lengths of each chain desired in expanded MSA. params : dict Data loading parameters Returns ------- a3m : dict Contains torch.Tensors `msa` and `ins` of expanded MSA. No taxids because no further joining needs to be done. """ assert(len(hashes_out)==len(Ls_out)) assert(set(hashes_in)==set(hashes_out)) assert(a3m['msa'].shape[1]==sum(Ls_in)) # figure out which oligomeric repeat is represented by each hash in `hashes_out` # each new repeat will be offset in sequence dimension of final MSA counts = dict() n_copy = [] # n-th copy of this hash in `hashes` for h in hashes_out: if h in counts: counts[h] += 1 else: counts[h] = 1 n_copy.append(counts[h]) # num sequences in source & destination MSAs N_in = a3m['msa'].shape[0] N_out = (N_in-1)*max(n_copy)+1 # concatenate query seqs, pad&stack the rest # source MSA msa_in, ins_in = a3m['msa'], a3m['ins'] # initialize destination MSA to gap characters msa_out = torch.full((N_out, sum(Ls_out)), ChemData().UNKINDEX) ins_out = torch.full((N_out, sum(Ls_out)), 0) # for each destination chain for i_out, h_out in enumerate(hashes_out): # identify index of source chain i_in = np.where(np.array(hashes_in)==h_out)[0][0] # residue indexes i1_res_in = sum(Ls_in[:i_in]) i2_res_in = sum(Ls_in[:i_in+1]) i1_res_out = sum(Ls_out[:i_out]) i2_res_out = sum(Ls_out[:i_out+1]) # copy over query sequence # NOTE: There is a bug in these next two lines! # The second line should be ins_out[0, i1_res_out:i2_res_out] = ins_in[0, i1_res_in:i2_res_in] msa_out[0, i1_res_out:i2_res_out] = msa_in[0, i1_res_in:i2_res_in] ins_out[0, i1_res_out:i2_res_out] = msa_in[0, i1_res_in:i2_res_in] # offset non-query sequences along sequence dimension based on repeat number of a given hash i1_seq_out = 1+(n_copy[i_out]-1)*(N_in-1) i2_seq_out = 1+n_copy[i_out]*(N_in-1) # copy over non-query sequences msa_out[i1_seq_out:i2_seq_out, i1_res_out:i2_res_out] = msa_in[1:, i1_res_in:i2_res_in] ins_out[i1_seq_out:i2_seq_out, i1_res_out:i2_res_out] = ins_in[1:, i1_res_in:i2_res_in] # only 1st oligomeric repeat can be fully paired is_paired_out = torch.cat([a3m['is_paired'], torch.zeros((N_out-N_in,)).bool()]) a3m_out = dict(msa=msa_out, ins=ins_out, is_paired=is_paired_out) a3m_out = remove_all_gap_seqs(a3m_out) return a3m_out def load_multi_msa(chain_ids, Ls, chid2hash, chid2taxid, params): """Loads multi-MSA for an arbitrary number of protein chains. Tries to locate paired MSAs and pair sequences across all chains by taxonomic ID. Unpaired sequences are padded and stacked on the bottom. """ # get MSA hashes (used to locate a3m files) and taxonomic IDs (used to determine pairing) hashes = [] hashes_unique = [] taxids_unique = [] Ls_unique = [] for chid,L_ in zip(chain_ids, Ls): hashes.append(chid2hash[chid]) if chid2hash[chid] not in hashes_unique: hashes_unique.append(chid2hash[chid]) taxids_unique.append(chid2taxid.get(chid)) Ls_unique.append(L_) # loads multi-MSA for unique chains a3m_prot, hashes_unique, Ls_unique = \ load_minimal_multi_msa(hashes_unique, taxids_unique, Ls_unique, params) # expands multi-MSA to repeat chains of homo-oligomers a3m_prot = expand_multi_msa(a3m_prot, hashes_unique, hashes, Ls_unique, Ls, params) return a3m_prot def choose_multimsa_clusters(msa_seq_is_paired, params): """Returns indices of fully-paired sequences in a multi-MSA to use as seed clusters during MSA featurization. """ frac_paired = msa_seq_is_paired.float().mean() if frac_paired > 0.25: # enough fully paired sequences, just let MSAFeaturize choose randomly return None else: # ensure that half of the clusters are fully-paired sequences, # and let the rest be chosen randomly N_seed = params['MAXLAT']//2 msa_seed_clus = [] for i_cycle in range(params['MAXCYCLE']): idx_paired = torch.where(msa_seq_is_paired)[0] msa_seed_clus.append(idx_paired[torch.randperm(len(idx_paired))][:N_seed]) return msa_seed_clus #fd def get_bond_distances(bond_feats): atom_bonds = (bond_feats > 0)*(bond_feats<5) dist_matrix = scipy.sparse.csgraph.shortest_path(atom_bonds.long().numpy(), directed=False) # dist_matrix = torch.tensor(np.nan_to_num(dist_matrix, posinf=4.0)) # protein portion is inf and you don't want to mask it out return torch.from_numpy(dist_matrix).float() def get_pdb(pdbfilename, plddtfilename, item, lddtcut, sccut): xyz, mask, res_idx = parse_pdb(pdbfilename) plddt = np.load(plddtfilename) # update mask info with plddt (ignore sidechains if plddt < 90.0) mask_lddt = np.full_like(mask, False) mask_lddt[plddt > sccut] = True mask_lddt[:,:5] = True mask = np.logical_and(mask, mask_lddt) mask = np.logical_and(mask, (plddt > lddtcut)[:,None]) return {'xyz':torch.tensor(xyz), 'mask':torch.tensor(mask), 'idx': torch.tensor(res_idx), 'label':item} def get_msa(a3mfilename, item, maxseq=5000): msa,ins, taxIDs = parse_a3m(a3mfilename, maxseq=5000) return {'msa':torch.tensor(msa), 'ins':torch.tensor(ins), 'taxIDs':taxIDs, 'label':item}