Commit 3cd38948 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Fixes to nblist featurization

parent f4e4f4c8
Loading
Loading
Loading
Loading
+61 −4
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
from deepchem.featurizers.grid_featurizer import GridFeaturizer
from deepchem.featurizers.atomic_coordinates import NeighborListComplexAtomicCoordinates

def load_pdbbind_labels(labels_file):
  """Loads pdbbind labels as dataframe"""
@@ -44,9 +45,7 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
  for complex_featurizer in complex_featurizers:
    features = complex_featurizer.featurize_complexes(
      [ligand_file], [protein_file])
    ################################################ DEBUG
    all_features.append(np.squeeze(features))
    ################################################ DEBUG
  
  for compound_featurizer in compound_featurizers:
    features = np.squeeze(compound_featurizer.featurize([rdkit_mol]))
@@ -55,6 +54,66 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
  features = np.concatenate(all_features)
  return features

def compute_pdbbind_coordinate_features(
    complex_featurizer, pdb_subdir, pdb_code):
  """Compute features for a given complex"""
  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)

  feature = complex_featurizer.featurize_complexes(
    [ligand_file], [protein_file])
  return feature

def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False
  neighbor_cutoff = 4
  max_num_neighbors = 10

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  featurizer = NeighborListComplexAtomicCoordinates(
      max_num_neighbors, neighbor_cutoff)
  
  # Featurize Dataset
  features = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_coordinate_features(
        featurizer, pdb_subdir, pdb_code)
    features.append(computed_feature)
  X = np.array(features, dtype-object)
  w = np.ones_like(y)
   
  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  
  return tasks, dataset, transformers

def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
@@ -113,9 +172,7 @@ def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  ############################################################# DEBUG
  y = y[y_inds]
  ############################################################# DEBUG
  X = np.vstack(features)
  w = np.ones_like(y)
   
+1 −11
Original line number Diff line number Diff line
@@ -281,16 +281,6 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
    """
    mol_coords, ob_mol = load_molecule(mol_pdb_file)
    protein_coords, protein_mol = load_molecule(protein_pdb_file)
    ########################################################## DEBUG
    print("mol_pdb_file, protein_pdb_file")
    print(mol_pdb_file, protein_pdb_file)
    print("type(mol_coords), type(ob_mol)")
    print(type(mol_coords), type(ob_mol))
    print("type(protein_coords), type(protein_mol)")
    print(type(protein_coords), type(protein_mol))
    print("mol_coords.shape, protein_coords.shape")
    print(mol_coords.shape, protein_coords.shape)
    ########################################################## DEBUG
    system_coords, system_mol = merge_molecules(
        mol_coords, ob_mol, protein_coords, protein_mol)
    
+16 −6
Original line number Diff line number Diff line
@@ -57,15 +57,25 @@ def load_molecule(molecule_file, remove_hydrogens=True,
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("mol2"), str("mol2"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, molecule_file)
  else:
    obConversion.ReadFile(ob_mol, str(molecule_file))
  elif ".sdf" in molecule_file:
    ###################################################### DEBUG
    print("sdf file")
    ###################################################### DEBUG
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("sdf"), str("sdf"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(molecule_file))
  elif ".pdb" in molecule_file:
    ###################################################### DEBUG
    print("non mol2 file")
    print("pdb file")
    ###################################################### DEBUG
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("pdb"), str("pdb"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(molecule_file))
  else:
    raise ValueError("Unrecognized file type")
  ###################################################### DEBUG
  print("ob_mol.NumAtoms()")
  print(ob_mol.NumAtoms())
+4 −0
Original line number Diff line number Diff line
@@ -242,3 +242,7 @@ class TestAtomicCoordinates(unittest.TestCase):
    system_coords, system_neighbor_list = complex_featurizer._featurize_complex(
        ligand_file, protein_file)
  
    N = system_coords.shape[0]
    assert len(system_neighbor_list.keys()) == N
    for atom in range(N):
      assert len(system_neighbor_list[atom]) <= max_num_neighbors