Commit b9efcd34 authored by Bharath Ramsundar
Browse files

Pause before design review on avoiding feature-array duplication in singletask.

parent 0cfd0d4f
Loading
Loading
Loading
Loading
+1 −17
Original line number Diff line number Diff line
@@ -58,8 +58,8 @@ def process_datasets(paths, input_transforms, output_transforms,
    train_dict["all"], test_dict["all"] = train_data, test_data
  else:
    raise ValueError("Unsupported mode for process_datasets.")
  print "Shape of Xtrain"
  target = train_dict.itervalues().next()
  print "Shape of Xtrain"
  print np.shape(target[1])
  return train_dict, test_dict 

@@ -103,22 +103,6 @@ def load_molecules(paths, feature_types=["fingerprints"]):
              entry["feature_types"].append(feature_type)
  return molecules 

#def get_target_names(paths, target_dir_name="targets"):
#  """Get names of targets in provided collections.
#
#  Parameters
#  ----------
#  paths: list 
#    List of paths to base directory.
#  """
#  target_names = []
#  for dataset_path in paths:
#    target_dir = os.path.join(dataset_path, target_dir_name)
#    target_names += [target_pickle.split(".")[0]
#        for target_pickle in os.listdir(target_dir)
#        if "pkl.gz" in target_pickle]
#  return target_names

def load_assays(paths, target_dir_name, target_names):
  """Load regression dataset labels from assays.

+0 −4
Original line number Diff line number Diff line
@@ -176,8 +176,6 @@ def dataset_to_numpy(dataset, weight_positives=False):
    fingerprint, labels = (datapoint["fingerprint"],
      datapoint["labels"])
    tensors.append(np.squeeze(fingerprint))
    #fingerprint.reshape(np.shape(X[id_ind]))
    #X[id_ind] = fingerprint
    sorted_targets = sorted(labels.keys())
    # Set labels from measurements
    for target_ind, target in enumerate(sorted_targets):
@@ -186,11 +184,9 @@ def dataset_to_numpy(dataset, weight_positives=False):
        W[id_ind][target_ind] = 0
      else:
        y[id_ind][target_ind] = labels[target]

  X = np.stack(tensors)
  if weight_positives:
    W = balance_positives(y, W)
  print("Done filling X")
  return (sorted_ids, X, y, W)

def multitask_to_singletask(dataset):