Commit ef843889 authored by Bharath
Browse files

Some debugging progress

parent fc7d9850
Loading
Loading
Loading
Loading
+12 −2
Original line number Diff line number Diff line
@@ -636,12 +636,17 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field):
    for task in range(n_tasks):
      if y[ind, task] == "":
        missing[ind, task] = 1
  x = np.squeeze(np.array(list(df[feature_type].values)))
  x_list = list(df[feature_type].values)
  valid_inds = np.array([1 if elt.size > 0 else 0 for elt in x_list], dtype=bool)
  x_list = [elt for (is_valid, elt) in zip(valid_inds, x_list) if is_valid]
  x = np.squeeze(np.array(x_list))
  ############################################################## DEBUG
  time2 = time.time()
  print("CONVERT_DF_TO_NUMPY X COMP TOOK %0.3f s" % (time2-time1))
  ############################################################## DEBUG
  sorted_ids = df[mol_id_field]
  ############################################################## DEBUG
  sorted_ids = df[mol_id_field].values
  ############################################################## DEBUG

  # Set missing data to have weight zero
  # TODO(rbharath): There's a better way to do this with numpy indexing
@@ -658,6 +663,11 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field):
  print("CONVERT_DF_TO_NUMPY MISSING COMP TOOK %0.3f s" % (time2-time1))
  ############################################################## DEBUG

  ############################################################## DEBUG
  sorted_ids = sorted_ids[valid_inds]
  y = y[valid_inds]
  w = w[valid_inds]
  ############################################################## DEBUG
  # Adding this assertion in to avoid ill-formed outputs.
  assert len(sorted_ids) == len(x) == len(y) == len(w)
  return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
+5 −5
Original line number Diff line number Diff line
@@ -45,17 +45,17 @@ class TestDatasetAPI(TestAPI):
    """Loads classification data from example.csv"""
    if os.path.exists(self.data_dir):
      shutil.rmtree(self.data_dir)
    featurizer = [CircularFingerprint(size=1024)]
    featurizer = CircularFingerprint(size=1024)
    tasks = ["outcome"]
    task_type = "classification"
    input_file = os.path.join(
        self.current_dir, "../../models/tests/example_classification.csv")
    featurizer = DataFeaturizer(
    loader = DataFeaturizer(
        tasks=tasks,
        smiles_field=self.smiles_field,
        featurizer=featurizer,
        verbosity="low")
    return featurizer.featurize(input_file, self.data_dir)
    return loader.featurize(input_file, self.data_dir)

  def load_multitask_data(self):
    """Load example multitask data."""
@@ -67,9 +67,9 @@ class TestDatasetAPI(TestAPI):
             "task13", "task14", "task15", "task16"]
    input_file = os.path.join(
        self.current_dir, "../../models/tests/multitask_example.csv")
    featurizer = DataFeaturizer(
    loader = DataFeaturizer(
        tasks=tasks,
        smiles_field=self.smiles_field,
        featurizer=featurizer,
        verbosity="low")
    return featurizer.featurize(input_file, self.data_dir)
    return loader.featurize(input_file, self.data_dir)
+5 −5
Original line number Diff line number Diff line
@@ -37,11 +37,11 @@ class TestDrop(TestAPI):
    featurizer = CircularFingerprint(size=1024)
    emols_tasks = ['activity']

    featurizer = DataFeaturizer(tasks=emols_tasks,
    loader = DataFeaturizer(tasks=emols_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
    dataset = featurizer.featurize(dataset_file, data_dir)
    dataset = loader.featurize(dataset_file, data_dir, debug=True, logging=False)

    X, y, w, ids = dataset.to_numpy()
    print("ids.shape, X.shape, y.shape, w.shape")
+57 −5
Original line number Diff line number Diff line
@@ -16,6 +16,8 @@ import multiprocessing as mp
from functools import partial
from rdkit import Chem
import itertools as it
import traceback
from multiprocessing.pool import Pool
from deepchem.utils.save import log
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_pickle_from_disk
@@ -26,11 +28,48 @@ from deepchem.utils.save import load_data
from deepchem.utils.save import get_input_type
############################################################## DEBUG
import time
import sys
############################################################## DEBUG

#def _process_helper(row, loader, fields, input_type):
#  return loader._process_raw_sample(input_type, row, fields)


# Shortcut to multiprocessing's logger
# http://stackoverflow.com/questions/6728236/exception-thrown-in-multiprocessing-pool-not-detected
def error(msg, *args):
  """Log `msg` at ERROR level on multiprocessing's logger.

  Standard output is flushed before the record is emitted (debugging aid).

  Args:
    msg: Message (or %-style format string) handed to the logger.
    *args: Optional values forwarded to the logger together with `msg`.

  Returns:
    Whatever the logger's `error` call returns.
  """
  # Local import kept so the helper does not depend on a module-level `sys`.
  import sys
  sys.stdout.flush()
  mp_logger = mp.get_logger()
  return mp_logger.error(msg, *args)

class LogExceptions(object):
  """Wrap a callable so that exceptions are logged before propagating.

  Calling an instance forwards all positional and keyword arguments to the
  wrapped callable and returns its result. If the callable raises an
  `Exception`, the formatted traceback is passed to `error` and the same
  exception is re-raised unchanged.
  """

  def __init__(self, callable):
    # The parameter name is part of the constructor's interface; keep it.
    self._wrapped = callable

  def __call__(self, *args, **kwargs):
    try:
      # Normal path: hand back whatever the wrapped callable produced.
      return self._wrapped(*args, **kwargs)
    except Exception:
      # Record the traceback through the multiprocessing logger helper,
      # then re-raise the original exception so the caller (e.g. a Pool
      # worker) can still see it and clean up.
      error(traceback.format_exc())
      raise

class LoggingPool(Pool):
  """A `Pool` whose async submissions wrap the task in `LogExceptions`.

  Both overrides delegate to the corresponding `Pool` method after wrapping
  `func`, so a traceback raised by the task is logged before it propagates.

  NOTE(review): whether `Pool.map` / `Pool.apply` route through these
  overrides depends on the Pool implementation of the running Python
  version -- confirm for the interpreter in use.
  """

  def apply_async(self, func, args=(), kwds=None, callback=None):
    """Submit `func(*args, **kwds)` wrapped in `LogExceptions`.

    Args:
      func: Callable to run.
      args: Positional arguments for `func`.
      kwds: Keyword arguments for `func`; `None` means no keyword arguments.
      callback: Forwarded unchanged to `Pool.apply_async`.

    Returns:
      The result object returned by `Pool.apply_async`.
    """
    # Avoid a shared mutable default: build a fresh dict per call.
    if kwds is None:
      kwds = {}
    return Pool.apply_async(self, LogExceptions(func), args, kwds, callback)

  def map_async(self, func, iterable, chunksize=None, callback=None):
    """Submit a map of `func` over `iterable`, wrapped in `LogExceptions`.

    Args:
      func: Callable applied to each element.
      iterable: Items to process.
      chunksize: Forwarded unchanged to `Pool.map_async`.
      callback: Forwarded unchanged to `Pool.map_async`.

    Returns:
      The result object returned by `Pool.map_async`.
    """
    return Pool.map_async(self, LogExceptions(func), iterable, chunksize, callback)

def featurize_map_function(args):
  #try:
  ############################################################## DEBUG
@@ -117,7 +156,8 @@ class DataFeaturizer(object):
    self.log_every_n = log_every_n

  def featurize(self, input_files, data_dir, shard_size=8192,
                num_shards_per_batch=24, worker_pool=None):
                num_shards_per_batch=24, worker_pool=None,
                logging=True, debug=False):
    """Featurize provided files and write to specified location."""
    ############################################################## DEBUG
    time1 = time.time()
@@ -138,8 +178,15 @@ class DataFeaturizer(object):
      return None
    input_type = get_input_type(input_files[0])

    if logging:
      mp.log_to_stderr()
    if worker_pool is None:
      ############################################################## DEBUG
      if logging:
        worker_pool = LoggingPool(processes=1)
      else:
        worker_pool = mp.Pool(processes=1)
      ############################################################## DEBUG
    log("Spawning workers now.", self.verbosity)
    metadata_rows = []
    data_iterator = it.izip(
@@ -159,9 +206,14 @@ class DataFeaturizer(object):
      ############################################################## DEBUG
      time1 = time.time()
      ############################################################## DEBUG
      iterator = itertools.islice(data_iterator, num_shards_per_batch)
      if not debug:
        batch_metadata = worker_pool.map(
          featurize_map_function,
          itertools.islice(data_iterator, num_shards_per_batch))
            featurize_map_function, iterator)
      else:
        batch_metadata = []
        for elt in iterator:
          batch_metadata.append(featurize_map_function(elt))
      ############################################################## DEBUG
      time2 = time.time()
      print("MAP CALL TOOK %0.3f s" % (time2-time1))
+5 −5
Original line number Diff line number Diff line
@@ -29,10 +29,10 @@ class TestDataFeaturizer(TestAPI):

    tasks = ["log-solubility"]
    smiles_field = "smiles"
    featurizer = DataFeaturizer(tasks=tasks,
    loader = DataFeaturizer(tasks=tasks,
                            smiles_field=self.smiles_field,
                                featurizers=[CircularFingerprint(size=1024)],
                            featurizer=CircularFingerprint(size=1024),
                            verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)
    dataset = loader.featurize(input_file, self.data_dir)
    
    assert len(dataset) == 10
Loading