Commit 659afe51 authored by evanfeinberg
Browse files

Fixes to integration tests.

parent e675b5c6
Loading
Loading
Loading
Loading
+0 −10
Original line number Original line Diff line number Diff line
@@ -93,20 +93,10 @@ class Evaluator(object):


    for i, task_name in enumerate(self.task_names):
    for i, task_name in enumerate(self.task_names):
      y = pred_y_df[task_name].values
      y = pred_y_df[task_name].values
      print("y")
      print(y)
      y_pred = pred_y_df["%s_pred" % task_name].values
      y_pred = pred_y_df["%s_pred" % task_name].values
      print("y_pred")
      print(y_pred)
      w = pred_y_df["%s_weight" % task_name].values
      w = pred_y_df["%s_weight" % task_name].values
      y = undo_transform(y, y_means, y_stds, self.output_transforms)
      y = undo_transform(y, y_means, y_stds, self.output_transforms)
      y_pred = undo_transform(y_pred, y_means, y_stds, self.output_transforms)
      y_pred = undo_transform(y_pred, y_means, y_stds, self.output_transforms)
      print("means, stds")
      print(y_means)
      print(y_stds)
      print("untransformed y, y_pred")
      print(y)
      print(y_pred)


      if self.task_type == "classification":
      if self.task_type == "classification":
        y, y_pred = y[w.nonzero()].astype(int), y_pred[w.nonzero()].astype(int)
        y, y_pred = y[w.nonzero()].astype(int), y_pred[w.nonzero()].astype(int)
+44 −44
Original line number Original line Diff line number Diff line
@@ -14,10 +14,12 @@ from vs_utils.features.fingerprints import CircularFingerprint
from vs_utils.features.basic import SimpleDescriptors
from vs_utils.features.basic import SimpleDescriptors
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import load_pickle_from_disk
from deepchem.utils.save import load_pandas_from_disk
from vs_utils.utils import ScaffoldGenerator
from vs_utils.utils import ScaffoldGenerator
from vs_utils.features.nnscore import NNScoreComplexFeaturizer
from vs_utils.features.nnscore import NNScoreComplexFeaturizer
import multiprocessing as mp
import multiprocessing as mp
from functools import partial



def generate_scaffold(smiles, include_chirality=False):
def generate_scaffold(smiles, include_chirality=False):
  """Compute the Bemis-Murcko scaffold for a SMILES string."""
  """Compute the Bemis-Murcko scaffold for a SMILES string."""
@@ -57,8 +59,6 @@ def _get_input_type(input_file):
    return "pandas-pickle"
    return "pandas-pickle"
  elif file_extension == ".joblib":
  elif file_extension == ".joblib":
    return "pandas-joblib"
    return "pandas-joblib"
  elif file_extension == ".sdf":
    return "sdf"
  else:
  else:
    raise ValueError("Unrecognized extension %s" % file_extension)
    raise ValueError("Unrecognized extension %s" % file_extension)


@@ -75,14 +75,11 @@ def _get_fields(input_file):
  elif input_type == "pandas-pickle":
  elif input_type == "pandas-pickle":
    df = load_pickle_from_disk(input_file)
    df = load_pickle_from_disk(input_file)
    return df.keys()
    return df.keys()
  elif input_type == "sdf":
    sample_mol = _get_raw_samples(input_file).next()
    return list(sample_mol.GetPropNames())
  else:
  else:
    raise ValueError("Unrecognized extension for %s" % input_file)
    raise ValueError("Unrecognized extension for %s" % input_file)



'''
def _get_raw_samples(input_file):
def _get_raw_samples(input_file, iterator=True):
  """Returns an iterator over all rows in input_file"""
  """Returns an iterator over all rows in input_file"""
  input_type = _get_input_type(input_file)
  input_type = _get_input_type(input_file)
  if input_type == "csv":
  if input_type == "csv":
@@ -101,19 +98,9 @@ def _get_raw_samples(input_file):
    dataframe = load_pickle_from_disk(input_file)
    dataframe = load_pickle_from_disk(input_file)
    for _, row in dataframe.iterrows():
    for _, row in dataframe.iterrows():
      yield row   
      yield row   
  elif input_type == "sdf":
    if ".gz" in input_file:
      with gzip.open(input_file) as inp_file_obj:
        supp = Chem.ForwardSDMolSupplier(inp_file_obj)
        for mol in supp:
          if mol is not None:
            yield mol
  else:
  else:
      with open(input_file) as inp_file_obj:
    raise ValueError("Unrecognized input type for %s" % input_file)
        supp = Chem.ForwardSDMolSupplier(inp_file_obj)
'''
        for mol in supp:
          if mol is not None:
            yield mol


class DataFeaturizer(object):
class DataFeaturizer(object):
  """
  """
@@ -146,21 +133,40 @@ class DataFeaturizer(object):
    self.verbose = verbose
    self.verbose = verbose
    self.log_every_n = log_every_n
    self.log_every_n = log_every_n


  def featurize(self, input_file, feature_types, out):
  def featurize(self, input_file, feature_types, feature_dir, shard_size=128):
    """Featurize provided file and write to specified location."""
    """Featurize provided file and write to specified location."""
    fields = _get_fields(input_file)
    input_type = _get_input_type(input_file)
    input_type = _get_input_type(input_file)


    rows = []
    print("Loading raw samples now.")
    for ind, row in enumerate(_get_raw_samples(input_file)):
    raw_df = load_pandas_from_disk(input_file)
      if ind % self.log_every_n == 0:
    fields = raw_df.keys()
        print("Loading sample %d" % ind)
    print("Loaded raw data frame from file.")
      rows.append(self._process_raw_sample(input_type, row, fields))
    def process_raw_sample_helper(row, fields, input_type):
    df = self._standardize_df(pd.DataFrame(rows))
      return self._process_raw_sample(input_type, row, fields)
    process_raw_sample_helper_partial = partial(process_raw_sample_helper, 
                                                fields=fields,
                                                input_type=input_type)

    processed_rows = raw_df.apply(process_raw_sample_helper_partial, axis=1)
    print("finished processing rows")
    raw_df = pd.DataFrame.from_records(processed_rows)

    nb_sample = raw_df.shape[0]
    interval_points = np.linspace(
        0, nb_sample, np.ceil(float(nb_sample)/shard_size)+1, dtype=int)
    shard_files = []
    for j in range(len(interval_points)-1):
      print("Sharding and standardizing into shard-%s / %s shards" % (str(j+1), len(interval_points)-1))
      raw_df_shard = raw_df.iloc[range(interval_points[j], interval_points[j+1])]
      df = self._standardize_df(raw_df_shard)
      for feature_type in feature_types:
      for feature_type in feature_types:
        print("Currently feauturizing feature_type: %s" % feature_type)
        print("Currently feauturizing feature_type: %s" % feature_type)
      self._featurize_df(df, rows, feature_type)
        self._featurize_df(df, feature_type)
    save_to_disk(df, out)

      shard_out = os.path.join(feature_dir, "features_shard%d.joblib" % j)
      save_to_disk(df, shard_out)
      shard_files.append(shard_out)
    return shard_files


  def _process_raw_sample(self, input_type, row, fields):
  def _process_raw_sample(self, input_type, row, fields):
    """Extract information from row data."""
    """Extract information from row data."""
@@ -172,14 +178,6 @@ class DataFeaturizer(object):
    elif input_type in ["pandas-pickle", "pandas-joblib"]:
    elif input_type in ["pandas-pickle", "pandas-joblib"]:
      for field in fields:
      for field in fields:
        data[field] = _process_field(row[field])
        data[field] = _process_field(row[field])
    elif input_type == "sdf":
      mol = row
      for field in fields:
        if not mol.HasProp(field):
          data[field] = None
        else:
          data[field] = _process_field(mol.GetProp(field))
      data["smiles"] = Chem.MolToSmiles(mol)
    else:
    else:
      raise ValueError("Unrecognized input_type")
      raise ValueError("Unrecognized input_type")
    if self.threshold is not None:
    if self.threshold is not None:
@@ -193,6 +191,8 @@ class DataFeaturizer(object):
  def _standardize_df(self, ori_df):
  def _standardize_df(self, ori_df):
    """Copy specified columns to new df with standard column names."""
    """Copy specified columns to new df with standard column names."""
    df = pd.DataFrame([])
    df = pd.DataFrame([])
    print("ori_df.keys()")
    print(ori_df.keys())
    df["mol_id"] = ori_df[[self.id_field]]
    df["mol_id"] = ori_df[[self.id_field]]
    df["smiles"] = ori_df[[self.smiles_field]]
    df["smiles"] = ori_df[[self.smiles_field]]
    for task in self.tasks:
    for task in self.tasks:
@@ -207,14 +207,14 @@ class DataFeaturizer(object):
      df["ligand_mol2"] = ori_df[[self.ligand_mol2_field]]
      df["ligand_mol2"] = ori_df[[self.ligand_mol2_field]]
    return df
    return df


  def _featurize_df(self, df, rows, feature_type):
  def _featurize_df(self, df, feature_type):
    """Generates circular fingerprints for dataset."""
    """Generates circular fingerprints for dataset."""
    if feature_type == "user-specified-features":
    if feature_type == "user-specified-features":
      if self.user_specified_features is not None:
      if self.user_specified_features is not None:
        if self.verbose:
        if self.verbose:
          print("Adding user-defined features.")
          print("Adding user-defined features.")
        features_data = []
        features_data = []
        for row in rows:
        for _, row in df.iterrows():
          # pandas rows are tuples (row_num, row_data)
          # pandas rows are tuples (row_num, row_data)
          feature_list = []
          feature_list = []
          for feature_name in self.user_specified_features:
          for feature_name in self.user_specified_features:
@@ -245,7 +245,7 @@ class DataFeaturizer(object):
      ligand_pdbs = list(df["ligand_pdb"])
      ligand_pdbs = list(df["ligand_pdb"])
      complexes = zip(ligand_pdbs, protein_pdbs)
      complexes = zip(ligand_pdbs, protein_pdbs)
      
      
      pool = mp.Pool(mp.cpu_count())
      pool = mp.Pool(processes=mp.cpu_count())
      features = pool.map(map_function, complexes)
      features = pool.map(map_function, complexes)
      pool.terminate()
      pool.terminate()


+14 −2
Original line number Original line Diff line number Diff line
@@ -10,6 +10,7 @@ from __future__ import unicode_literals
from sklearn.externals import joblib
from sklearn.externals import joblib
import gzip
import gzip
import cPickle as pickle
import cPickle as pickle
import pandas as pd


def save_to_disk(dataset, filename):
def save_to_disk(dataset, filename):
  """Save a dataset to file."""
  """Save a dataset to file."""
@@ -17,8 +18,10 @@ def save_to_disk(dataset, filename):


def load_from_disk(filename):
def load_from_disk(filename):
  """Load a dataset from file."""
  """Load a dataset from file."""
  dataset = joblib.load(filename)
  if ".pkl" in filename:
  return dataset
    return load_pickle_from_disk(filename)
  else:
    return joblib.load(filename)


def load_pickle_from_disk(filename):
def load_pickle_from_disk(filename):
  """Load dataset from pickle file."""
  """Load dataset from pickle file."""
@@ -29,3 +32,12 @@ def load_pickle_from_disk(filename):
    with open(filename, "rb") as f:
    with open(filename, "rb") as f:
      df = pickle.load(f)
      df = pickle.load(f)
  return df
  return df

def load_pandas_from_disk(filename):
  """Load a dataset from disk as a pandas DataFrame.

  CSV files (optionally compressed, e.g. ``data.csv.gz``) are parsed with
  ``pd.read_csv``; any other file is delegated to ``load_from_disk``.

  Args:
    filename: Path of the file to load.

  Returns:
    For CSV input, a DataFrame whose column names are taken from the first
    line of the file. Otherwise, whatever ``load_from_disk`` returns.
  """
  # Function-scope import so this block does not depend on a module-level
  # ``import os`` being present.
  import os

  # Decide on the extension of the final path component only. A substring
  # test on the whole path would treat "run.csv_dir/feat.joblib" as a CSV.
  root, extension = os.path.splitext(os.path.basename(filename))
  if extension in (".gz", ".bz2", ".zip", ".xz"):
    # Compressed CSVs keep working: read_csv infers compression from the
    # suffix, so look at the extension underneath it.
    extension = os.path.splitext(root)[1]

  if extension != ".csv":
    return load_from_disk(filename)
  # First line of user-specified CSV *must* be header.
  return pd.read_csv(filename, header=0)
 No newline at end of file
+4 −6
Original line number Original line Diff line number Diff line
@@ -50,11 +50,10 @@ class TestSingletaskVectorAPI(unittest.TestCase):
                                protein_pdb_field=protein_pdb_field,
                                protein_pdb_field=protein_pdb_field,
                                ligand_pdb_field=ligand_pdb_field,
                                ligand_pdb_field=ligand_pdb_field,
                                verbose=True)
                                verbose=True)
    feature_file = os.path.join(self.feature_dir, "out.joblib")
    feature_files = featurizer.featurize(input_file, feature_types, self.feature_dir)
    featurizer.featurize(input_file, feature_types, feature_file)


    # Transform data into arrays for ML
    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, [feature_file],
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)
                                reload_data=False)


    # Split into train/test
    # Split into train/test
@@ -180,11 +179,10 @@ class TestMultitaskVectorAPI(unittest.TestCase):
    featurizer = DataFeaturizer(tasks=self.tasks,
    featurizer = DataFeaturizer(tasks=self.tasks,
                                smiles_field=self.smiles_field,
                                smiles_field=self.smiles_field,
                                verbose=True)
                                verbose=True)
    feature_file = os.path.join(self.feature_dir, "out.joblib")
    feature_files = featurizer.featurize(self.input_file, feature_types, self.feature_dir)
    featurizer.featurize(self.input_file, feature_types, feature_file)


    # Transform data into arrays for ML
    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, [feature_file],
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)
                                reload_data=False)


    # Split into train/test
    # Split into train/test