Unverified Commit d0fac17d authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #893 from lilleswing/rdkit-upgrade

[READY TO MERGE] rdkit upgrade
parents faab4d73 09aa2a2d
Loading
Loading
Loading
Loading
+18 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from pandas import read_hdf
import tempfile
import time
import shutil
import json
from multiprocessing.dummy import Pool

__author__ = "Bharath Ramsundar"
@@ -432,6 +433,23 @@ class NumpyDataset(Dataset):
    """
    return NumpyDataset(ds.X, ds.y, ds.w, ds.ids)

  def to_json(self, fname):
    """Serialize this dataset's arrays to a JSON file.

    Writes one JSON object with the keys 'X', 'y', 'w' and 'ids', each
    holding the result of calling `.tolist()` on the attribute of the same
    name.

    This is a regular instance method (not a staticmethod) so that
    `dataset.to_json(fname)` binds `self` correctly. The explicit form
    `NumpyDataset.to_json(dataset, fname)` keeps working as well.

    Parameters
    ----------
    fname: str
      Path of the JSON file to write. The file is opened in 'w' mode, so any
      existing content is replaced.
    """
    d = {
        'X': self.X.tolist(),
        'y': self.y.tolist(),
        'w': self.w.tolist(),
        'ids': self.ids.tolist()
    }
    with open(fname, 'w') as fout:
      json.dump(d, fout)

  @staticmethod
  def from_json(fname):
    """Build a NumpyDataset from a JSON file.

    The file is parsed with `json.load`, and its 'X', 'y', 'w' and 'ids'
    entries are handed, in that order, to the NumpyDataset constructor.

    Parameters
    ----------
    fname: str
      Path of the JSON file to read.

    Returns
    -------
    NumpyDataset
      Dataset constructed from the four entries of the parsed JSON object.
    """
    with open(fname) as handle:
      payload = json.load(handle)
    # Look the fields up in constructor order: X, y, w, ids.
    fields = [payload[key] for key in ('X', 'y', 'w', 'ids')]
    return NumpyDataset(*fields)


class DiskDataset(Dataset):
  """
+7 −13
Original line number Diff line number Diff line
@@ -18,6 +18,8 @@ import numpy as np
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor
from subprocess import call
from deepchem.utils import download_url
from deepchem.utils import get_data_dir


class TestPoseScoring(unittest.TestCase):
@@ -27,20 +29,14 @@ class TestPoseScoring(unittest.TestCase):

  def setUp(self):
    """Downloads dataset."""
    call(
        "wget -nv -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets/core_grid.tar.gz".
        split())
    call("tar -zxvf core_grid.tar.gz".split())
    self.core_dataset = dc.data.DiskDataset("core_grid/")

  def tearDown(self):
    """Removes dataset"""
    call("rm -rf core_grid/".split())
    download_url(
        "http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets/core_grid.json"
    )
    json_fname = os.path.join(get_data_dir(), 'core_grid.json')
    self.core_dataset = dc.data.NumpyDataset.from_json(json_fname)

  def test_pose_scorer_init(self):
    """Tests that pose-score works."""
    if sys.version_info >= (3, 0):
      return
    sklearn_model = RandomForestRegressor(n_estimators=10)
    model = dc.models.SklearnModel(sklearn_model)
    print("About to fit model on core set")
@@ -50,8 +46,6 @@ class TestPoseScoring(unittest.TestCase):

  def test_pose_scorer_score(self):
    """Tests that scores are generated"""
    if sys.version_info >= (3, 0):
      return
    current_dir = os.path.dirname(os.path.realpath(__file__))
    protein_file = os.path.join(current_dir, "1jld_protein.pdb")
    ligand_file = os.path.join(current_dir, "1jld_ligand.sdf")
+33 −116
Original line number Diff line number Diff line
@@ -44,123 +44,41 @@ class RDKitDescriptors(Featurizer):

  # (ytz): This is done to avoid future compatibility issues like inclusion of
  # the 3D descriptors or changing the feature size.
  allowedDescriptors = set(['MaxAbsPartialCharge',
    'MinPartialCharge',
    'MinAbsPartialCharge',
    'HeavyAtomMolWt',
    'MaxAbsEStateIndex',
    'NumRadicalElectrons',
    'NumValenceElectrons',
    'MinAbsEStateIndex',
    'MaxEStateIndex',
    'MaxPartialCharge',
    'MinEStateIndex',
    'ExactMolWt',
    'MolWt',
    'BalabanJ',
    'BertzCT',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Chi2v',
    'Chi3n',
    'Chi3v',
    'Chi4n',
    'Chi4v',
    'HallKierAlpha',
    'Ipc',
    'Kappa1',
    'Kappa2',
    'Kappa3',
    'LabuteASA',
    'PEOE_VSA1',
    'PEOE_VSA10',
    'PEOE_VSA11',
    'PEOE_VSA12',
    'PEOE_VSA13',
    'PEOE_VSA14',
    'PEOE_VSA2',
    'PEOE_VSA3',
    'PEOE_VSA4',
    'PEOE_VSA5',
    'PEOE_VSA6',
    'PEOE_VSA7',
    'PEOE_VSA8',
    'PEOE_VSA9',
    'SMR_VSA1',
    'SMR_VSA10',
    'SMR_VSA2',
    'SMR_VSA3',
    'SMR_VSA4',
    'SMR_VSA5',
    'SMR_VSA6',
    'SMR_VSA7',
    'SMR_VSA8',
    'SMR_VSA9',
    'SlogP_VSA1',
    'SlogP_VSA10',
    'SlogP_VSA11',
    'SlogP_VSA12',
    'SlogP_VSA2',
    'SlogP_VSA3',
    'SlogP_VSA4',
    'SlogP_VSA5',
    'SlogP_VSA6',
    'SlogP_VSA7',
    'SlogP_VSA8',
    'SlogP_VSA9',
    'TPSA',
    'EState_VSA1',
    'EState_VSA10',
    'EState_VSA11',
    'EState_VSA2',
    'EState_VSA3',
    'EState_VSA4',
    'EState_VSA5',
    'EState_VSA6',
    'EState_VSA7',
    'EState_VSA8',
    'EState_VSA9',
    'VSA_EState1',
    'VSA_EState10',
    'VSA_EState2',
    'VSA_EState3',
    'VSA_EState4',
    'VSA_EState5',
    'VSA_EState6',
    'VSA_EState7',
    'VSA_EState8',
    'VSA_EState9',
    'FractionCSP3',
    'HeavyAtomCount',
    'NHOHCount',
    'NOCount',
    'NumAliphaticCarbocycles',
    'NumAliphaticHeterocycles',
    'NumAliphaticRings',
    'NumAromaticCarbocycles',
    'NumAromaticHeterocycles',
    'NumAromaticRings',
    'NumHAcceptors',
    'NumHDonors',
    'NumHeteroatoms',
    'NumRotatableBonds',
    'NumSaturatedCarbocycles',
    'NumSaturatedHeterocycles',
    'NumSaturatedRings',
    'RingCount',
    'MolLogP',
    'MolMR'])

  allowedDescriptors = set([
      'MaxAbsPartialCharge', 'MinPartialCharge', 'MinAbsPartialCharge',
      'HeavyAtomMolWt', 'MaxAbsEStateIndex', 'NumRadicalElectrons',
      'NumValenceElectrons', 'MinAbsEStateIndex', 'MaxEStateIndex',
      'MaxPartialCharge', 'MinEStateIndex', 'ExactMolWt', 'MolWt', 'BalabanJ',
      'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n',
      'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc',
      'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10',
      'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2',
      'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7',
      'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3',
      'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9',
      'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2',
      'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7',
      'SlogP_VSA8', 'SlogP_VSA9', 'TPSA', 'EState_VSA1', 'EState_VSA10',
      'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4',
      'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9',
      'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3',
      'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8',
      'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount',
      'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
      'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
      'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
      'NumRotatableBonds', 'NumSaturatedCarbocycles',
      'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', 'MolLogP',
      'MolMR'
  ])

  def __init__(self):
    """Select the descriptor functions this featurizer will evaluate.

    Walks `Descriptors.descList` once and keeps every (name, function) entry
    whose name is in `self.allowedDescriptors`. The surviving pairs are
    stored in `self.descList` and their names in `self.descriptors`, both in
    the order in which `Descriptors.descList` yields them.
    """
    allowed = self.allowedDescriptors
    self.descList = [(name, func)
                     for name, func in Descriptors.descList
                     if name in allowed]
    self.descriptors = [name for name, _ in self.descList]

  def _featurize(self, mol):
    """
@@ -172,7 +90,6 @@ class RDKitDescriptors(Featurizer):
        Molecule.
    """
    rval = []
    for desc_name, function in Descriptors.descList:
      if desc_name in self.allowedDescriptors:
    for desc_name, function in self.descList:
      rval.append(function(mol))
    return rval
+5 −2
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ class TestMolecularWeight(unittest.TestCase):
  """
  Test MolecularWeight.
  """

  def setUp(self):
    """
    Set up tests.
@@ -32,6 +33,7 @@ class TestRDKitDescriptors(unittest.TestCase):
  """
  Test RDKitDescriptors.
  """

  def setUp(self):
    """
    Set up tests.
@@ -46,5 +48,6 @@ class TestRDKitDescriptors(unittest.TestCase):
    """
    descriptors = self.engine([self.mol])
    assert np.allclose(
      descriptors[0, self.engine.descriptors.index('ExactMolWt')], 180,
        descriptors[0, self.engine.descriptors.index('ExactMolWt')],
        180,
        atol=0.1)
+8 −6
Original line number Diff line number Diff line
@@ -491,7 +491,7 @@ class PowerTransformer(Transformer):
  def untransform(self, z):
    """Recover first-power-block features from a power-expanded array.

    Keeps the first `z.shape[1] // len(self.powers)` columns of `z` and
    raises them to `1 / self.powers[0]`.

    Parameters
    ----------
    z: np.ndarray
      2-D array. Presumably one equally wide column block per entry of
      `self.powers`, concatenated along axis 1 -- confirm against the
      matching transform.

    Returns
    -------
    np.ndarray
      The leading column block of `z` with the first power undone.
    """
    n_powers = len(self.powers)
    # Floor division keeps the width an int so it is valid as a slice bound.
    orig_len = z.shape[1] // n_powers
    z = z[:, :orig_len]
    # A float numerator prevents the exponent from collapsing to 0 when the
    # power is an int and true division is not in effect.
    z = np.power(z, 1.0 / self.powers[0])
    return z
@@ -736,8 +736,8 @@ class IRVTransformer():
    print('start similarity calculation')
    time1 = time.time()
    similarity = IRVTransformer.matrix_mul(X_target, np.transpose(self.X)) / (
        n_features - IRVTransformer.matrix_mul(1 - X_target,
                                               np.transpose(1 - self.X)))
        n_features -
        IRVTransformer.matrix_mul(1 - X_target, np.transpose(1 - self.X)))
    time2 = time.time()
    print('similarity calculation takes %i s' % (time2 - time1))
    for i in range(self.n_tasks):
@@ -784,8 +784,8 @@ class IRVTransformer():
    X_trans = []
    for count in range(X_length // 5000 + 1):
      X_trans.append(
          self.X_transform(dataset.X[count * 5000:min((count + 1) * 5000,
                                                      X_length), :]))
          self.X_transform(
              dataset.X[count * 5000:min((count + 1) * 5000, X_length), :]))
    X_trans = np.concatenate(X_trans, axis=0)
    return NumpyDataset(X_trans, dataset.y, dataset.w, ids=None)

@@ -993,7 +993,9 @@ class ANITransformer(Transformer):
        end = min((start + 1) * batch_size, X.shape[0])
        X_batch = X[(start * batch_size):end]
        output = self.sess.run(
            [self.outputs], feed_dict={self.inputs: X_batch})[0]
            [self.outputs], feed_dict={
                self.inputs: X_batch
            })[0]
        X_out.append(output)
        num_transformed = num_transformed + X_batch.shape[0]
        print('%i samples transformed' % num_transformed)
Loading