Unverified commit 516fb0a8, authored by Bharath Ramsundar and committed by GitHub
Browse files

Merge pull request #1638 from VIGS25/smiles-to-img-fixes

ChemNet Fixes and Additions
parents 94809d39 c2cfbc45
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -201,6 +201,9 @@ class SmilesToImage(Featurizer):
      # Compute atom properties
      atom_props = np.array([[atom.GetAtomicNum()] for atom in cmol.GetAtoms()])

      bond_props = bond_props.astype(np.float32)
      atom_props = atom_props.astype(np.float32)

    else:
      # Setup image
      img = np.zeros((self.img_size, self.img_size, 4))
@@ -218,6 +221,13 @@ class SmilesToImage(Featurizer):
          atom.GetHybridization().real,
      ] for atom in cmol.GetAtoms()])

      bond_props = bond_props.astype(np.float32)
      atom_props = atom_props.astype(np.float32)

      partial_charges = atom_props[:, 1]
      if np.any(np.isnan(partial_charges)):
        return []

    frac = np.linspace(0, 1, int(1 / self.res * 2))
    # Reshape done for proper broadcast
    frac = frac.reshape(-1, 1, 1)
+46 −21
Original line number Diff line number Diff line
import numpy as np
import tensorflow as tf
import time
import logging
import os

logger = logging.getLogger(__name__)

from deepchem.data import NumpyDataset
from deepchem.models.losses import Loss
@@ -348,7 +352,6 @@ class KerasModel(Model):
      inputs, labels, weights = self._prepare_batch(batch)
      self._tensorboard_step += 1
      should_log = (
          self.tensorboard and
          self._tensorboard_step % self.tensorboard_log_frequency == 0)
      if tf.executing_eagerly():

@@ -401,7 +404,7 @@ class KerasModel(Model):
                  loss_tensor, global_step=self._global_step, var_list=vars)
            train_op = self._custom_train_op[op_key]
        fetches = [train_op, self._loss_tensor, self._global_step]
        if should_log:
        if self.tensorboard and should_log:
          fetches.append(self._summary_ops)
        feed_dict = dict(zip(self._input_placeholders, inputs))
        feed_dict.update(dict(zip(self._label_placeholders, labels)))
@@ -409,33 +412,36 @@ class KerasModel(Model):
        fetched_values = self.session.run(fetches, feed_dict=feed_dict)
        avg_loss += fetched_values[1]
        current_step = fetched_values[2]
        if should_log:

        if self.tensorboard and should_log:
          self._summary_writer.reopen()
          self._summary_writer.add_summary(
              fetched_values[3], global_step=current_step)
          self._summary_writer.close()

      # Report progress and write checkpoints.

      averaged_batches += 1
      if checkpoint_interval > 0 and current_step % checkpoint_interval == checkpoint_interval - 1:
        self._exec_with_session(lambda: manager.save())
      if should_log:
        avg_loss = float(avg_loss) / averaged_batches
        print(
        logger.info(
            'Ending global_step %d: Average loss %g' % (current_step, avg_loss))
        avg_loss = 0.0
        averaged_batches = 0

    # Report final results.
      if checkpoint_interval > 0 and current_step % checkpoint_interval == checkpoint_interval - 1:
        self._exec_with_session(lambda: manager.save())

    if checkpoint_interval > 0:
    # Report final results.
    if averaged_batches > 0:
      avg_loss = float(avg_loss) / averaged_batches
        print(
      logger.info(
          'Ending global_step %d: Average loss %g' % (current_step, avg_loss))

    if checkpoint_interval > 0:
      self._exec_with_session(lambda: manager.save())

    time2 = time.time()
      print("TIMING: model fitting took %0.3f s" % (time2 - time1))
    logger.info("TIMING: model fitting took %0.3f s" % (time2 - time1))
    return avg_loss

  def fit_on_batch(self, X, y, w, variables=None, loss=None):
@@ -898,7 +904,7 @@ class KerasModel(Model):
          pad_batches=pad_batches):
        yield ([X_b], [y_b], [w_b])

  def save_checkpoint(self, max_checkpoints_to_keep=5):
  def save_checkpoint(self, max_checkpoints_to_keep=5, model_dir=None):
    """Save a checkpoint to disk.

    Usually you do not need to call this method, since fit() saves checkpoints
@@ -909,9 +915,15 @@ class KerasModel(Model):
    ----------
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    model_dir: str, default None
      Directory to save the checkpoint to. If None, self.model_dir is used.
      The directory is created if it does not already exist.
    """
    self._ensure_built()
    manager = tf.train.CheckpointManager(self._checkpoint, self.model_dir,
    if model_dir is None:
      model_dir = self.model_dir
    if not os.path.exists(model_dir):
      os.makedirs(model_dir)
    manager = tf.train.CheckpointManager(self._checkpoint, model_dir,
                                         max_checkpoints_to_keep)
    self._exec_with_session(lambda: manager.save())

@@ -922,12 +934,20 @@ class KerasModel(Model):
      with self.session.as_default():
        f()

  def get_checkpoints(self):
    """Get a list of all available checkpoint files."""
    return tf.train.get_checkpoint_state(
        self.model_dir).all_model_checkpoint_paths
  def get_checkpoints(self, model_dir=None):
    """Get a list of all available checkpoint files.

  def restore(self, checkpoint=None):
    Parameters
    ----------
    model_dir: str, default None
      Directory to list checkpoints from. If None, self.model_dir is used.

    """
    if model_dir is None:
      model_dir = self.model_dir
    return tf.train.get_checkpoint_state(model_dir).all_model_checkpoint_paths

  def restore(self, checkpoint=None, model_dir=None):
    """Reload the values of all variables from a checkpoint file.

    Parameters
@@ -936,10 +956,14 @@ class KerasModel(Model):
      the path to the checkpoint file to load.  If this is None, the most recent
      checkpoint will be chosen automatically.  Call get_checkpoints() to get a
      list of all available checkpoints.
    model_dir: str, default None
      Directory to restore checkpoint from. If None, use self.model_dir.
    """
    self._ensure_built()
    if model_dir is None:
      model_dir = self.model_dir
    if checkpoint is None:
      checkpoint = tf.train.latest_checkpoint(self.model_dir)
      checkpoint = tf.train.latest_checkpoint(model_dir)
    if checkpoint is None:
      raise ValueError('No checkpoint found')
    if tf.executing_eagerly():
@@ -975,5 +999,6 @@ class _StandardLoss(object):
        shape = w.shape
      shape = tuple(-1 if x is None else x for x in shape)
      w = tf.reshape(w, shape + (1,) * (len(losses.shape) - len(w.shape)))

    loss = losses * w
    return tf.reduce_mean(loss) + sum(self.model.losses)
+35 −9
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ def load_chembl25(featurizer="smiles2seq",
                  save_dir=None,
                  split_seed=None,
                  reload=True,
                  transformer_type='minmax',
                  **kwargs):
  """Loads the ChEMBL25 dataset, featurizes it, and does a split.
  Parameters
@@ -68,6 +69,8 @@ def load_chembl25(featurizer="smiles2seq",
    Seed to be used for splitting the dataset
  reload: bool, default True
    Whether to reload saved dataset
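  transformer_type: str, default 'minmax'
    Label (y) transformer to apply. 'minmax' selects MinMaxTransformer; any
    other value selects NormalizationTransformer.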
  """
  if data_dir is None:
    data_dir = DEFAULT_DIR
@@ -121,10 +124,17 @@ def load_chembl25(featurizer="smiles2seq",
      input_files=[dataset_file], shard_size=10000, data_dir=save_folder)

  if split is None:
    transformer = [
    if transformer_type == "minmax":
      transformers = [
          dc.trans.MinMaxTransformer(
              transform_X=False, transform_y=True, dataset=dataset)
      ]
    else:
      transformers = [
          dc.trans.NormalizationTransformer(
              transform_X=False, transform_y=True, dataset=dataset)
      ]

    logger.info("Split is None, about to transform dataset.")
    for transformer in transformers:
      dataset = transformer.transform(dataset)
@@ -139,11 +149,27 @@ def load_chembl25(featurizer="smiles2seq",
  logger.info("About to split data with {} splitter.".format(split))
  splitter = splitters[split]

  train, valid, test = splitter.train_valid_test_split(dataset, seed=split_seed)
  frac_train = kwargs.get('frac_train', 4 / 6)
  frac_valid = kwargs.get('frac_valid', 1 / 6)
  frac_test = kwargs.get('frac_test', 1 / 6)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      seed=split_seed,
      frac_train=frac_train,
      frac_test=frac_test,
      frac_valid=frac_valid)
  if transformer_type == "minmax":
    transformers = [
        dc.trans.MinMaxTransformer(
            transform_X=False, transform_y=True, dataset=train)
    ]
  else:
    transformers = [
        dc.trans.NormalizationTransformer(
            transform_X=False, transform_y=True, dataset=train)
    ]

  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
+36 −13
Original line number Diff line number Diff line
@@ -10,29 +10,42 @@ import deepchem

logger = logging.getLogger(__name__)

HIV_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/HIV.csv'
DEFAULT_DIR = deepchem.utils.get_data_dir()

def load_hiv(featurizer='ECFP', split='index', reload=True, **kwargs):

def load_hiv(featurizer='ECFP',
             split='index',
             reload=True,
             data_dir=None,
             save_dir=None,
             **kwargs):
  """Load the HIV dataset, featurize it, and split it into train/valid/test sets."""
  # Featurize hiv dataset
  logger.info("About to featurize hiv dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "hiv/" + featurizer + "/" + str(split))

  dataset_file = os.path.join(data_dir, "HIV.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/HIV.csv'
    )
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  hiv_tasks = ["HIV_active"]

  save_folder = os.path.join(save_dir, "hiv-featurized", str(featurizer),
                             str(split))
  if featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    save_folder = os.path.join(save_folder, img_spec)

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
        save_folder)
    if loaded:
      return hiv_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "HIV.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=HIV_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
@@ -64,10 +77,20 @@ def load_hiv(featurizer='ECFP', split='index', reload=True, **kwargs):
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'butina': deepchem.splits.ButinaSplitter()
      'butina': deepchem.splits.ButinaSplitter(),
      'stratified': deepchem.splits.RandomStratifiedSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split dataset with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = [
@@ -81,6 +104,6 @@ def load_hiv(featurizer='ECFP', split='index', reload=True, **kwargs):
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return hiv_tasks, (train, valid, test), transformers
+33 −12
Original line number Diff line number Diff line
@@ -10,35 +10,47 @@ import deepchem

logger = logging.getLogger(__name__)

SAMPL_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/SAMPL.csv'
DEFAULT_DIR = deepchem.utils.get_data_dir()


def load_sampl(featurizer='ECFP',
               split='index',
               reload=True,
               move_mean=True,
               data_dir=None,
               save_dir=None,
               **kwargs):
  """Load SAMPL datasets."""
  # Featurize SAMPL dataset
  logger.info("About to featurize SAMPL dataset.")
  logger.info("About to load SAMPL dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:

  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if move_mean:
      dir_name = "sampl/" + featurizer + "/" + str(split)
    save_folder = os.path.join(data_dir, "sampl-featurized", str(featurizer),
                               str(split))
  else:
      dir_name = "sampl/" + featurizer + "_mean_unmoved/" + str(split)
    save_dir = os.path.join(data_dir, dir_name)
    save_folder = os.path.join(data_dir, "sampl-featurized",
                               str(featurizer) + "_mean_unmoved", str(split))

  if featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    save_folder = os.path.join(save_folder, img_spec)

  dataset_file = os.path.join(data_dir, "SAMPL.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/SAMPL.csv'
    )
    deepchem.utils.download_url(url=SAMPL_URL, dest_dir=data_dir)

  SAMPL_tasks = ['expt']

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
        save_folder)
    if loaded:
      return SAMPL_tasks, all_dataset, transformers

@@ -77,6 +89,15 @@ def load_sampl(featurizer='ECFP',
  }
  splitter = splitters[split]
  logger.info("About to split dataset with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = [
@@ -91,6 +112,6 @@ def load_sampl(featurizer='ECFP',
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return SAMPL_tasks, (train, valid, test), transformers
Loading