Commit 7fe2b751 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #443 from joegomes/hopv2

Harvard Organic Photovoltaic dataset + examples
parents 879ddaa9 8e2caa31
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
# Download and unpack the HOPV dataset archive into the current directory.
# Abort immediately if any step fails (e.g. the download), so we never try
# to extract a missing or truncated archive.
set -e
echo "Pulling HOPV dataset from deepchem"
wget http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/hopv.tar.gz
echo "Extracting HOPV dataset"
tar -zxvf hopv.tar.gz
+52 −0
Original line number Diff line number Diff line
"""
HOPV dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
import deepchem as dc


def load_hopv(featurizer='ECFP', split='index'):
  """Load HOPV datasets. Does not do train/test split.

  Downloads the dataset on first use, featurizes it, normalizes the
  regression targets, and splits it into train/valid/test subsets.

  Parameters
  ----------
  featurizer: str
    Either 'ECFP' (circular fingerprints) or 'GraphConv'
    (graph-convolution features).
  split: str
    One of 'index', 'random', 'scaffold', or 'butina'. An unknown value
    raises KeyError.

  Returns
  -------
  tuple
    (hopv_tasks, (train, valid, test), transformers)

  Raises
  ------
  ValueError
    If `featurizer` is not a supported option.
  """
  # Featurize HOPV dataset
  print("About to featurize HOPV dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(current_dir, "hopv.csv")
  if not os.path.exists(dataset_file):
    # Run the fetch script from this module's directory so the archive is
    # extracted next to hopv.csv's expected location, regardless of the
    # caller's working directory. (Previously the script ran in the CWD
    # and the CSV could land somewhere else.)
    os.system('cd "%s" && sh get_hopv.sh' % current_dir)

  hopv_tasks = [
      'HOMO', 'LUMO', 'electrochemical_gap', 'optical_gap', 'PCE', 'V_OC',
      'J_SC', 'fill_factor'
  ]
  if featurizer == 'ECFP':
    featurizer_func = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer_func = dc.feat.ConvMolFeaturizer()
  else:
    # Previously an unrecognized featurizer fell through both branches and
    # produced a NameError at the CSVLoader call; fail fast instead.
    raise ValueError("Unsupported featurizer: %s" % featurizer)
  loader = dc.data.CSVLoader(
      tasks=hopv_tasks, smiles_field="smiles", featurizer=featurizer_func)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers: normalize each regression target in-place.
  transformers = [
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
      'butina': dc.splits.ButinaSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  return hopv_tasks, (train, valid, test), transformers
+64 −0
Original line number Diff line number Diff line
"""
Script that trains graph-conv models on HOPV dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
np.random.seed(123)
import tensorflow as tf
tf.set_random_seed(123)
import deepchem as dc
from hopv_datasets import load_hopv

# Load HOPV dataset
hopv_tasks, hopv_datasets, transformers = load_hopv(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = hopv_datasets

# Fit models
metric = [
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean, mode="regression"),
    dc.metrics.Metric(
        dc.metrics.mean_absolute_error, np.mean, mode="regression")
]

# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 50
graph_model = dc.nn.SequentialGraph(n_feat)
graph_model.add(dc.nn.GraphConv(64, n_feat, activation='relu'))
graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
graph_model.add(dc.nn.GraphPool())
graph_model.add(dc.nn.GraphConv(64, 64, activation='relu'))
graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
graph_model.add(dc.nn.GraphPool())
# Gather Projection
graph_model.add(dc.nn.Dense(128, 64, activation='relu'))
graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
graph_model.add(dc.nn.GraphGather(batch_size, activation="tanh"))

model = dc.models.MultitaskGraphRegressor(
    graph_model,
    len(hopv_tasks),
    n_feat,
    batch_size=batch_size,
    learning_rate=1e-3,
    learning_rate_decay_time=1000,
    optimizer_type="adam",
    beta1=.9,
    beta2=.999)

# Fit trained model
model.fit(train_dataset, nb_epoch=25)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
+59 −0
Original line number Diff line number Diff line
"""
Script that trains multitask models on HOPV dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
import deepchem as dc
from hopv_datasets import load_hopv

# Only for debug!
np.random.seed(123)

# Load HOPV dataset
n_features = 1024
hopv_tasks, hopv_datasets, transformers = load_hopv()
train_dataset, valid_dataset, test_dataset = hopv_datasets

# Fit models
metric = [
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean, mode="regression"),
    dc.metrics.Metric(
        dc.metrics.mean_absolute_error, np.mean, mode="regression")
]

n_layers = 1
n_bypass_layers = 1
nb_epoch = 25
model = dc.models.RobustMultitaskRegressor(
    len(hopv_tasks),
    train_dataset.get_data_shape()[0],
    layer_sizes=[500] * n_layers,
    bypass_layer_sizes=[50] * n_bypass_layers,
    dropouts=[.25] * n_layers,
    bypass_dropouts=[.25] * n_bypass_layers,
    weight_init_stddevs=[.02] * n_layers,
    bias_init_consts=[.5] * n_layers,
    bypass_weight_init_stddevs=[.02] * n_bypass_layers,
    bypass_bias_init_consts=[.5] * n_bypass_layers,
    learning_rate=.0003,
    penalty=.0001,
    penalty_type="l2",
    optimizer="adam",
    batch_size=100)

# Fit trained model
model.fit(train_dataset, nb_epoch=nb_epoch)
model.save()

print("Evaluating model")
train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
+48 −0
Original line number Diff line number Diff line
"""
Script that trains sklearn models on HOPV dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
import deepchem as dc
from hopv_datasets import load_hopv
from sklearn.ensemble import RandomForestRegressor

# Only for debug!
np.random.seed(123)

# Load HOPV dataset
hopv_tasks, hopv_datasets, transformers = load_hopv()
(train_dataset, valid_dataset, test_dataset) = hopv_datasets

# Fit models
metric = [
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean, mode="regression"),
    dc.metrics.Metric(
        dc.metrics.mean_absolute_error, np.mean, mode="regression")
]


def model_builder(model_dir):
  sklearn_model = RandomForestRegressor(n_estimators=500)
  return dc.models.SklearnModel(sklearn_model, model_dir)


model = dc.models.SingletaskToMultitask(hopv_tasks, model_builder)

# Fit trained model
print("About to fit model")
model.fit(train_dataset)
model.save()

print("About to evaluate model")
train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
Loading