Commit 1aafd057 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #218 from flee2/master

LogTransformer
parents 494cce1f 47f17ca4
Loading
Loading
Loading
Loading
+18 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ import numpy as np
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers import UserDefinedFeaturizer
from deepchem.transformers import NormalizationTransformer
from deepchem.models.tests import TestAPI

@@ -89,3 +90,20 @@ class TestDatasetAPI(TestAPI):
        featurizer=featurizer,
        verbosity="low")
    return loader.featurize(input_file, self.data_dir)
  
  def load_feat_multitask_data(self):
    """Featurize the example CSV of user-defined numeric features and tasks.

    Any existing `self.data_dir` is removed first; the result of
    `DataLoader.featurize` on the example file is returned.
    """
    # Clear out whatever a previous run left in the output directory.
    if os.path.exists(self.data_dir):
      shutil.rmtree(self.data_dir)

    feature_columns = [
        "feat0", "feat1", "feat2", "feat3", "feat4", "feat5"]
    task_columns = [
        "task0", "task1", "task2", "task3", "task4", "task5"]
    csv_path = os.path.join(
        self.current_dir, "../../models/tests/feat_multitask_example.csv")

    data_loader = DataLoader(
        tasks=task_columns,
        featurizer=UserDefinedFeaturizer(feature_columns),
        id_field="id",
        verbosity="low")
    return data_loader.featurize(csv_path, self.data_dir)
+21 −0
Original line number Diff line number Diff line
id,feat0,feat1,feat2,feat3,feat4,feat5,task0,task1,task2,task3,task4,task5
a,46.95782353,2281.647059,151.7058824,4617.882353,1.9,26.64922258,11.21868066,-0.090215861,-0.365766551,35.77,144.8,37.9
b,47.88242857,2319.142857,152.1428571,4735.428571,1.9,26.78909726,8.324593175,-0.079948474,-0.474354584,35.07,145.5,37.4
c,138.6181145,3723.5363,206.33226,6203.928571,1.9,37.1436695,8.578717902,-0.268764633,-0.361584692,35.3,146.9,30.38
d,138.7386005,3713.185012,206.0336651,6257.974239,1.9,36.87034129,11.23065221,-0.438421012,-0.320220121,34.4,144.4,30.7
e,138.8590865,3702.833724,205.7350703,6312.019906,1.9,36.59701308,8.416282679,-0.270811911,-0.348160171,34.3,146.5,31.4
f,138.9795725,3692.482436,205.4364754,6366.065574,1.9,36.32368486,8.669120274,-1.282007735,-0.265793715,32.9,147.2,30.4
g,139.1000585,3682.131148,205.1378806,6420.111241,1.9,36.05035665,9.112035358,-4.030753981,-0.668500332,31.79,145.6,31.3
h,139.1962977,3686.540266,205.3578968,6401.200156,1.9,36.1772358,9.6052444,-0.631338884,-0.506531256,114.3,274.1,31.8
i,139.4615165,3651.077283,204.242096,6582.248244,1.9,35.23037201,12.67692068,-0.676249992,-0.613614489,103.2,283.3,31.1
j,73.87012602,3814.417326,152.1845065,7250.374011,1.9,18.02326885,11.77976823,-1.057947135,-0.626014954,121.4,294.8,85.7
k,73.89791191,3814.388588,152.1845065,7250.444815,1.96,18.02946371,7.22752611,-0.346213407,-0.35855121,123.7,278.7,92.9
l,77.32905831,3939.72475,153.463929,7709.933407,1.9,17.54688431,11.59280813,-0.680834029,-0.326826596,116.4,286,99.1
m,77.3553563,3940.081125,153.4799088,7711.70732,1.9,17.54890124,8.974658029,-0.871722799,-3.760401578,113.8,282.4,90.6
n,77.34120898,3940.001738,153.4717637,7711.142485,1.9,17.54711417,6.55383368,-3.624110554,-2.171964228,113.7,274.6,93.7
o,77.3528481,3940.267064,153.4792686,7712.300659,1.9,17.54733436,9.53024048,-0.230776362,-0.558358676,167.9,479.83,92.4
p,77.3640073,3940.521449,153.486464,7713.411079,1.9,17.54754547,8.115302888,-0.590017824,-0.260407039,166.6,462.75,88.7
q,52.87515522,3494.684377,125.6443977,6983.617044,2.55,11.86665765,8.584149314,-1.046291059,-0.560093204,167.2,474.84,204.4
r,52.90128948,3494.917412,125.6591268,6987.330878,2.55,11.86815323,9.450506663,-0.245156156,-0.559851977,170.7,463.23,199.2
s,52.88928649,3494.926546,125.6539482,6985.271577,2.55,11.86787173,8.09698666,-1.058468966,-0.3532891,142.2,459.28,205.1
t,52.92972473,3494.779433,125.6641502,6987.208325,2.55,11.87241024,8.279762212,-0.943225009,-0.415862327,172.7,475.4,206.7
+55 −5
Original line number Diff line number Diff line
@@ -92,8 +92,8 @@ class Transformer(object):
def _transform_row(i, df, transformer, data_dir):
  """
  Transforms the data (X, y, w,...) in a single row.

  Writes X-transformed, y-transformed to disk.

  This is a thin module-level wrapper that forwards to
  transformer.transform_row(i, df, data_dir); nothing is returned.
  """
  transformer.transform_row(i, df, data_dir)

@@ -300,22 +300,72 @@ class ClippingTransformer(Transformer):

class LogTransformer(Transformer):
  """Applies log(1 + x) to features and/or tasks, optionally per column.

  When `features` (for X) or `tasks` (for y) is None, every column is
  transformed. Otherwise only the listed column indices are transformed
  and all other columns are passed through unchanged.
  """

  def __init__(self, transform_X=False, transform_y=False,
               features=None, tasks=None,
               dataset=None):
    """Initialize log transformation.

    Parameters
    ----------
    transform_X: bool
      Whether to transform the feature matrix X.
    transform_y: bool
      Whether to transform the label matrix y.
    features: sequence of int, optional
      Column indices of X to transform. None means all columns.
    tasks: sequence of int, optional
      Column indices of y to transform. None means all columns.
    dataset: optional
      Forwarded unchanged to the base Transformer constructor.
    """
    self.features = features
    self.tasks = tasks
    # NOTE(review): self.transform_X / self.transform_y are presumably set by
    # the base-class constructor; it is not visible from this block.
    super(LogTransformer, self).__init__(
        transform_X=transform_X, transform_y=transform_y,
        dataset=dataset)

  @staticmethod
  def _apply_to_columns(data, columns, func):
    """Return a new array with `func` applied to the chosen columns of `data`.

    If `columns` is None, `func` is applied to the whole of `data`.
    Otherwise `data` must be 2-D; indices in `columns` that fall outside
    range(data.shape[1]) are ignored. The input is never modified.
    """
    if columns is None:
      return func(data)
    result = np.array(data)  # np.array copies, so the caller's data is safe
    # Assigning float results into an integer array would silently truncate.
    if not np.issubdtype(result.dtype, np.inexact):
      result = result.astype(float)
    selected = [j for j in range(result.shape[1]) if j in columns]
    if selected:
      result[:, selected] = func(result[:, selected])
    return result

  def transform_row(self, i, df, data_dir):
    """Logarithmically transforms the data referenced by row i of df.

    Loads the arrays named by the row's 'X-transformed' / 'y-transformed'
    entries (relative to data_dir), applies log(1 + x) to the selected
    features / tasks, and writes the result back to the same file.
    """
    row = df.iloc[i]
    if self.transform_X:
      X_path = os.path.join(data_dir, row['X-transformed'])
      X = load_from_disk(X_path)
      save_to_disk(
          self._apply_to_columns(X, self.features, np.log1p), X_path)

    if self.transform_y:
      y_path = os.path.join(data_dir, row['y-transformed'])
      y = load_from_disk(y_path)
      save_to_disk(
          self._apply_to_columns(y, self.tasks, np.log1p), y_path)

  def untransform(self, z):
    """Undoes the logarithmic transformation on provided data.

    Returns a new array holding exp(z) - 1 for the selected columns (all
    columns when no selection was given). If transform_X is set, z is
    treated as features; otherwise, if transform_y is set, as tasks. When
    neither flag is set there is nothing to undo and z is returned as is.
    """
    if self.transform_X:
      return self._apply_to_columns(z, self.features, np.expm1)
    if self.transform_y:
      return self._apply_to_columns(z, self.tasks, np.expm1)
    return z

class BalancingTransformer(Transformer):
  """Balance positive and negative examples for weights."""
+67 −3
Original line number Diff line number Diff line
@@ -11,15 +11,15 @@ __license__ = "GPL"

import unittest
import numpy as np
import pandas as pd
import os
from deepchem.transformers import LogTransformer
from deepchem.transformers import NormalizationTransformer
from deepchem.transformers import BalancingTransformer
from deepchem.datasets.tests import TestDatasetAPI

class TestTransformerAPI(TestDatasetAPI):
  """
  Test top-level API for transformer objects.
  """
  """Test top-level API for transformer objects."""

  def test_y_log_transformer(self):
    """Tests logarithmic data transformer."""
@@ -65,6 +65,70 @@ class TestTransformerAPI(TestDatasetAPI):
    # Check that untransform does the right thing.
    np.testing.assert_allclose(log_transformer.untransform(X_t), X)
 
  def test_y_log_transformer_select(self):
    """Tests logarithmic data transformer restricted to selected tasks."""
    multitask_dataset = self.load_feat_multitask_data()
    dfe = pd.read_csv(os.path.join(self.current_dir,
                      "../../models/tests/feat_multitask_example.csv"))
    tasklist = ["task0", "task3", "task4", "task5"]
    # A task's index is its CSV column position relative to the first task.
    first_task_loc = dfe.columns.get_loc("task0")
    tasks = np.array(
        [dfe.columns.get_loc(task) - first_task_loc for task in tasklist],
        dtype=int)
    log_transformer = LogTransformer(
        transform_y=True, tasks=tasks,
        dataset=multitask_dataset)
    X, y, w, ids = multitask_dataset.to_numpy()
    log_transformer.transform(multitask_dataset)
    X_t, y_t, w_t, ids_t = multitask_dataset.to_numpy()

    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a y transformer
    np.testing.assert_allclose(X, X_t)
    # Check w is unchanged since this is a y transformer
    np.testing.assert_allclose(w, w_t)
    # Check the selected tasks of y are now logarithmic versions of themselves
    np.testing.assert_allclose(y_t[:, tasks], np.log(y[:, tasks] + 1))
    # Check the tasks that were not selected are left untouched
    skipped = [j for j in range(y.shape[1]) if j not in tasks]
    np.testing.assert_allclose(y_t[:, skipped], y[:, skipped])

    # Check that untransform does the right thing.
    np.testing.assert_allclose(log_transformer.untransform(y_t), y)

  def test_X_log_transformer_select(self):
    """Tests logarithmic data transformer restricted to selected features."""
    multitask_dataset = self.load_feat_multitask_data()
    dfe = pd.read_csv(os.path.join(self.current_dir,
                      "../../models/tests/feat_multitask_example.csv"))
    featurelist = ["feat0", "feat1", "feat2", "feat3", "feat5"]
    # A feature's index is its CSV column position relative to the first
    # feature column.
    first_feature_loc = dfe.columns.get_loc("feat0")
    features = np.array(
        [dfe.columns.get_loc(feature) - first_feature_loc
         for feature in featurelist],
        dtype=int)
    log_transformer = LogTransformer(
        transform_X=True, features=features,
        dataset=multitask_dataset)
    X, y, w, ids = multitask_dataset.to_numpy()
    log_transformer.transform(multitask_dataset)
    X_t, y_t, w_t, ids_t = multitask_dataset.to_numpy()

    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check y is unchanged since this is a X transformer
    np.testing.assert_allclose(y, y_t)
    # Check w is unchanged since this is a X transformer
    np.testing.assert_allclose(w, w_t)
    # Check the selected features of X are now logarithmic versions of
    # themselves
    np.testing.assert_allclose(X_t[:, features], np.log(X[:, features] + 1))
    # Check the features that were not selected are left untouched
    skipped = [j for j in range(X.shape[1]) if j not in features]
    np.testing.assert_allclose(X_t[:, skipped], X[:, skipped])

    # Check that untransform does the right thing.
    np.testing.assert_allclose(log_transformer.untransform(X_t), X)

  def test_y_normalization_transformer(self):
    """Tests normalization transformer."""
    solubility_dataset = self.load_solubility_data()