Commit d52c0b9d authored by Yutong Zhao's avatar Yutong Zhao
Browse files

Save via numpy

parent a5033e34
Loading
Loading
Loading
Loading
+79 −0
Original line number Diff line number Diff line
@@ -4,8 +4,12 @@
Created on Thu Jul  6 20:31:47 2017

@author: zqwu
@contributors: ytz

"""
import os
import numpy as np
import json
import scipy.optimize
import tensorflow as tf

@@ -42,7 +46,9 @@ class BPSymmetryFunctionRegression(TensorGraph):
    self.max_atoms = max_atoms
    self.n_feat = n_feat
    self.layer_structures = layer_structures
    
    super(BPSymmetryFunctionRegression, self).__init__(**kwargs)

    self.build_graph()

  def build_graph(self):
@@ -126,6 +132,18 @@ class ANIRegression(TensorGraph):
    self.layer_structures = layer_structures
    self.atom_number_cases = atom_number_cases
    super(ANIRegression, self).__init__(**kwargs)


    # (ytz): this is really dirty but needed for restoring models
    self._kwargs = {
      "n_tasks": n_tasks,
      "max_atoms": max_atoms,
      "layer_structures": layer_structures,
      "atom_number_cases": atom_number_cases
    }

    self._kwargs.update(kwargs)

    self.build_graph()
    self.grad = None

@@ -359,3 +377,64 @@ class ANIRegression(TensorGraph):
        feed_dict[self.atom_numbers] = np.array(X_b[:, :, 0], dtype=int)
        feed_dict[self.atom_feats] = np.array(X_b[:, :, :], dtype=float)
        yield feed_dict

  def save_numpy(self):
    """
    Save the trainable variables and constructor arguments to a numpy file.

    Every trainable variable of the model's graph is stored under its
    TensorFlow variable name, so restoring relies on those names being
    consistent across different versions. The constructor arguments kept in
    self._kwargs are JSON-encoded and stored under the "_kwargs" key. The
    file is saved as save_pickle.npz under the model_dir.

    """
    path = os.path.join(self.model_dir, "save_pickle.npz")

    with self._get_tf("Graph").as_default():
      all_vars = tf.trainable_variables()
      all_vals = self.session.run(all_vars)

    # One entry per variable, keyed by the variable's graph name.
    save_dict = {var.name: val for var, val in zip(all_vars, all_vals)}

    # np.bytes_ is available on every numpy version, whereas the np.string_
    # alias was removed in numpy 2.0.
    save_dict["_kwargs"] = np.array(
        [json.dumps(self._kwargs)], dtype=np.bytes_)

    np.savez(path, **save_dict)


  @classmethod
  def load_numpy(cls, model_dir):
    """
    Load from a portable numpy file.

    Reads save_pickle.npz under model_dir, constructs the model from the
    JSON-encoded arguments stored under the "_kwargs" key, and assigns every
    other stored array to the graph tensor with the same name.

    Parameters
    ----------
    model_dir: str
      Location of the model directory.

    Returns
    -------
    An instance of cls with the stored values assigned to its graph.

    """
    path = os.path.join(model_dir, "save_pickle.npz")

    # Read everything up front inside a context manager so the archive's
    # file handle is released even if rebuilding the model fails later.
    with np.load(path) as npo:
      kwargs = json.loads(npo["_kwargs"][0].decode('UTF-8'))
      values = {k: npo[k] for k in npo.keys() if k != "_kwargs"}

    obj = cls(**kwargs)
    obj.build()

    g = obj._get_tf("Graph")

    with g.as_default():
      # Keys are the tensor names written at save time; a name missing from
      # the rebuilt graph makes get_tensor_by_name raise.
      all_ops = [
          tf.assign(g.get_tensor_by_name(name), val)
          for name, val in values.items()
      ]
      obj.session.run(all_ops)

    return obj
+53 −15
Original line number Diff line number Diff line
import os
import unittest
import tempfile

import scipy
import numpy as np
import unittest

from deepchem.models import ANIRegression
import deepchem as dc

class TestANIRegression(unittest.TestCase):

  def test_gradients(self):
  def setUp(self):

    max_atoms = 3

@@ -25,20 +28,30 @@ class TestANIRegression(unittest.TestCase):
    layer_structures = [128, 128, 64]
    atom_number_cases = [1, 6, 7, 8]

    model = ANIRegression(
      1,
      max_atoms,
      layer_structures=layer_structures,
      atom_number_cases=atom_number_cases,
      batch_size=1,
      learning_rate=0.001,
      use_queue=False,
      mode="regression")
    self.model_dir = tempfile.mkdtemp()

    self.kwargs = {     
      "n_tasks":1,
      "max_atoms":max_atoms,
      "layer_structures":layer_structures,
      "atom_number_cases":atom_number_cases,
      "batch_size":1,
      "learning_rate":0.001,
      "use_queue":False,
      "mode":"regression",
      "model_dir": self.model_dir}

    model = ANIRegression(**self.kwargs)

    train_dataset = dc.data.NumpyDataset(X, y, n_tasks=1)

    model.fit(train_dataset, nb_epoch=2, checkpoint_interval=100)

    self.model = model


  def test_gradients(self):

    new_x = np.array([
      -2.0, 1.2, 2.1,
      1.3, -6.4, 3.1,
@@ -59,16 +72,16 @@ class TestANIRegression(unittest.TestCase):
      d_new_x_plus[idx] += delta
      d_new_x_minus = np.array(new_x)
      d_new_x_minus[idx] -= delta      
      dydx = (model.pred_one(d_new_x_plus, new_atomic_nums)-model.pred_one(d_new_x_minus, new_atomic_nums))/(2*delta)
      dydx = (self.model.pred_one(d_new_x_plus, new_atomic_nums)-self.model.pred_one(d_new_x_minus, new_atomic_nums))/(2*delta)
      grad_approx.append(dydx[0])

    grad_approx = np.array(grad_approx)

    grad_exact = model.grad_one(new_x, new_atomic_nums)
    grad_exact = self.model.grad_one(new_x, new_atomic_nums)

    np.testing.assert_array_almost_equal(grad_approx, grad_exact, decimal=3)

    grad_exact_constrained = model.grad_one(new_x, new_atomic_nums, constraints=[0, 2])
    grad_exact_constrained = self.model.grad_one(new_x, new_atomic_nums, constraints=[0, 2])

    assert grad_exact_constrained[0] == 0
    assert grad_exact_constrained[1] == 0
@@ -82,7 +95,7 @@ class TestANIRegression(unittest.TestCase):
    assert grad_exact_constrained[7] == 0
    assert grad_exact_constrained[8] == 0

    min_coords = model.minimize_structure(new_x, new_atomic_nums, constraints=[0,2])
    min_coords = self.model.minimize_structure(new_x, new_atomic_nums, constraints=[0,2])

    assert min_coords[0][0] == new_x[0]
    assert min_coords[0][1] == new_x[1]
@@ -96,6 +109,31 @@ class TestANIRegression(unittest.TestCase):
    assert min_coords[2][1] == new_x[7]
    assert min_coords[2][2] == new_x[8]

  def test_numpy_save_load(self):
    """Check that a model restored via load_numpy matches the saved one.

    The fitted model from setUp is written with save_numpy, restored from
    the same model_dir, and compared on its constructor attributes and on
    the prediction for one fixed structure.
    """

    self.model.save_numpy()
    restored_model = ANIRegression.load_numpy(self.model_dir)

    new_x = np.array([
      -2.0, 1.2, 2.1,
      1.3, -6.4, 3.1,
      -2.5, 2.4, 5.6,
    ])

    new_atomic_nums = np.array([1,1,6])

    expected = self.model.pred_one(new_x, new_atomic_nums)
    predicted = restored_model.pred_one(new_x, new_atomic_nums)

    restored_attrs = (
      "n_tasks",
      "max_atoms",
      "layer_structures",
      "atom_number_cases",
      "batch_size",
      "learning_rate",
      "use_queue",
    )
    for attr in restored_attrs:
      # msg names the attribute so a failure points at what was lost.
      self.assertEqual(
        getattr(self.model, attr), getattr(restored_model, attr), msg=attr)

    # A bare `assert a == b` on arrays is ambiguous for more than one
    # element; compare element-wise and get a readable diff on failure.
    np.testing.assert_array_equal(expected, predicted)

if __name__ == '__main__':
  unittest.main()
 No newline at end of file
+32 −0
Original line number Diff line number Diff line
@@ -126,3 +126,35 @@ class anidataloader(object):

  def cleanup(self):
    """Close the store held in ``self.store``."""
    store = self.store
    store.close()

if __name__ == "__main__":
  # Example driver: walk every ANI HDF5 file found under the directory
  # named by the ROITBERG_ANI environment variable and pull out the fields
  # of each record.
  base_dir = os.environ["ROITBERG_ANI"]

  # Number of conformations in each file increases exponentially.
  # Start with a smaller dataset before continuing. Use all of them
  # for production
  hdf5files = [
      'ani_gdb_s01.h5',
      'ani_gdb_s02.h5',
      'ani_gdb_s03.h5',
      'ani_gdb_s04.h5',
      'ani_gdb_s05.h5',
      'ani_gdb_s06.h5',
      'ani_gdb_s07.h5',
      'ani_gdb_s08.h5'
  ]

  hdf5files = [os.path.join(base_dir, f) for f in hdf5files]

  for hdf5file in hdf5files:
    print("processing", hdf5file)
    adl = anidataloader(hdf5file)
    try:
      for data in adl:

        # Extract the data
        P = data['path']
        R = data['coordinates']
        E = data['energies']
        S = data['species']
        smi = data['smiles']
    finally:
      # Close the loader's store before opening the next file, even when
      # iteration fails part-way through.
      adl.cleanup()
 No newline at end of file
+18 −13
Original line number Diff line number Diff line
import numpy as np
import os

import tensorflow as tf

import deepchem as dc
import pyanitools as pya
import app
@@ -55,12 +57,12 @@ def load_roiterberg_ANI(mode="atomization"):
  hdf5files = [
      'ani_gdb_s01.h5',
      'ani_gdb_s02.h5',
      'ani_gdb_s03.h5',
      'ani_gdb_s04.h5',
      'ani_gdb_s05.h5',
      'ani_gdb_s06.h5',
      'ani_gdb_s07.h5',
      'ani_gdb_s08.h5'
      # 'ani_gdb_s03.h5',
      # 'ani_gdb_s04.h5',
      # 'ani_gdb_s05.h5',
      # 'ani_gdb_s06.h5',
      # 'ani_gdb_s07.h5',
      # 'ani_gdb_s08.h5'
  ]

  hdf5files = [os.path.join(base_dir, f) for f in hdf5files]
@@ -194,11 +196,11 @@ if __name__ == "__main__":
      dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
  ]

  model_dir = "/tmp/ani8.pkl"
  model_dir = "/tmp/ani8.npz"

  if os.path.exists(model_dir):
    print("Restoring existing model...")
    model = dc.models.ANIRegression.load_from_dir(model_dir=model_dir)
    model = dc.models.ANIRegression.load_numpy(model_dir=model_dir)
  else:
    print("Fitting new model...")

@@ -219,7 +221,6 @@ if __name__ == "__main__":

    print("Total training set shape: ", train_dataset.get_shape())


    for transformer in transformers:
      train_dataset = transformer.transform(train_dataset)
      valid_dataset = transformer.transform(valid_dataset)
@@ -236,12 +237,16 @@ if __name__ == "__main__":
        model_dir=model_dir,
        mode="regression")

    # For production, set nb_epoch to 100+
    for i in range(10):
      model.fit(train_dataset, nb_epoch=10, checkpoint_interval=100)
   #   # For production, set nb_epoch to 100+
    for i in range(1):
      model.fit(train_dataset, nb_epoch=1, checkpoint_interval=100)


    # model.save_numpy(save_path)
    # dc.models.ANIRegression.load_numpy(save_path, **kwargs)

      print("Saving model...")
      model.save()
      model.save_numpy()
      print("Done.")

    print("Evaluating model")