Commit aac23085 authored by Bharath Ramsundar
Browse files

Cleanup

parent 7fe6e64b
Loading
Loading
Loading
Loading
+2 −97
Original line number Diff line number Diff line
@@ -38,7 +38,6 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
  """Compute features for a given complex"""
  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
  #rdkit_mol = Chem.MolFromMol2File(str(ligand_file))
  rdkit_mol = Chem.SDMolSupplier(str(ligand_file)).next()

  all_features = []
@@ -47,45 +46,8 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
      [ligand_file], [protein_file])
    ################################################ DEBUG
    all_features.append(np.squeeze(features))
    print("type(features)")
    print(type(features))
    #all_features += features
    ################################################ DEBUG
  ################################################ DEBUG
  print("grid_featurizer outcome")
  print("all_features")
  print(all_features)
    ################################################ DEBUG
  
  for compound_featurizer in compound_featurizers:
    features = np.squeeze(compound_featurizer.featurize([rdkit_mol]))
    all_features.append(features)

  ################################################ DEBUG
  print("complex_featurizers, compound_featurizers")
  print(complex_featurizers, compound_featurizers)
  print("len(all_features)")
  print(len(all_features))
  print("[features.shape for features in all_features]")
  print([features.shape for features in all_features])
  ################################################ DEBUG
  features = np.concatenate(all_features)
  return features

def compute_pdbbind_atomic_coordinates(compound_featurizers, complex_featurizers,
                                       pdb_subdir, pdb_code):
  """Compute features for a given complex"""
  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
  #rdkit_mol = Chem.MolFromMol2File(str(ligand_file))
  rdkit_mol = Chem.SDMolSupplier(str(ligand_file)).next()

  all_features = []
  for complex_featurizer in complex_featurizers:
    features = complex_featurizer.featurize_complexes(
      [ligand_file], [protein_file])
    all_features.append(features)
  
  for compound_featurizer in compound_featurizers:
    features = np.squeeze(compound_featurizer.featurize([rdkit_mol]))
    all_features.append(features)
@@ -124,6 +86,8 @@ def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0, feature_types="voxel_combined",
      # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
      # causes segfaults.
      #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      #"salt_bridge"], ecfp_power=9, splif_power=9,
      voxel_feature_types=["ecfp", "splif", "hbond", 
@@ -150,10 +114,6 @@ def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
    y_inds.append(ind)
    features.append(computed_feature)
  ############################################################# DEBUG
  print("[feature.shape for feature in features]")
  print([feature.shape for feature in features])
  ############################################################# DEBUG
  ############################################################# DEBUG
  y = y[y_inds]
  ############################################################# DEBUG
  X = np.vstack(features)
@@ -163,58 +123,3 @@ def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  transformers = []
  
  return tasks, dataset, transformers

def load_core_pdbbind_atomic_coordinates(pdbbind_dir, base_dir, reload=True):
  """Load and featurize the PDBBind core set. Does not do train/test split.

  Labels are read from "INDEX_core_data.2013" inside `pdbbind_dir`, and each
  complex is looked up in the "website-core-set/<PDB code>" subdirectory.
  Every complex is featurized with `compute_pdbbind_atomic_coordinates`
  using a `GridFeaturizer` (complex features) and a 1024-bit
  `CircularFingerprint` (ligand features).

  Parameters
  ----------
  pdbbind_dir: str
    Directory holding the PDBBind index file and the "website-core-set"
    folder.
  base_dir: str
    Directory that holds the results of the analysis. It is created if it
    does not exist; the dataset is written to its "dataset" subdirectory.
  reload: bool, optional (default True)
    If False, any existing `base_dir` is deleted before being recreated, so
    nothing previously stored there is reused. If True, an existing
    `base_dir` is left in place.

  Returns
  -------
  tasks: list
    The single task name, ["-logKd/Ki"].
  dataset: Dataset
    Result of `Dataset.from_numpy(data_dir, X, y, w, ids)`, with all weights
    set to one.
  transformers: list
    Always an empty list; no transformations are applied here.
  """
  verbosity = "high"

  # The base_dir holds the results of all analysis. The caller's `reload`
  # flag is honored here: a stale directory is only wiped on request.
  if not reload and os.path.exists(base_dir):
    shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  # Directory in which the featurized dataset is stored.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0, feature_types="voxel_combined",
      voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      "salt_bridge"], ecfp_power=9, splif_power=9,
      parallel=True, flatten=True, verbosity=verbosity)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  for pdb_code in ids:
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_atomic_coordinates(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    # An empty result is replaced by a zero vector so that every id still
    # contributes one row and X stays aligned with y.
    # NOTE(review): the 1024 width matches the fingerprint size only;
    # confirm it equals the full feature width, or np.vstack will fail.
    if len(computed_feature) == 0:
      computed_feature = np.zeros(1024)
    features.append(computed_feature)
  X = np.vstack(features)
  w = np.ones_like(y)

  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []

  return tasks, dataset, transformers