Commit 6d910d96 authored by Bharath Ramsundar

Merge pull request #13 from evanfeinberg/master

Smoothed featurization and data processing procedure
parents f01ed589 50fbddcc

.gitignore

0 → 100644
+59 −0
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
+0 −1
@@ -26,7 +26,6 @@ def parse_args(input_args=None):
                      choices=["log", "normalize"],
                      help="Transforms to apply to output data.")
  parser.add_argument("--feature-types", nargs="+", required=1,
                      choices=["fingerprints", "descriptors", "grid"],
                      help="Types of featurizations to use.")
  parser.add_argument("--paths", nargs="+", required=1,
                      help="Paths to input datasets.")
+75 −30
@@ -11,7 +11,7 @@ import argparse
import csv
from rdkit import Chem
import subprocess
from vs_utils.utils import SmilesGenerator
from vs_utils.utils import SmilesGenerator, ScaffoldGenerator

def parse_args(input_args=None):
  """Parse command-line arguments."""
@@ -32,15 +32,21 @@ def parse_args(input_args=None):
                      help="Name of the dataset.")
  parser.add_argument("--out", required=1,
                      help="Folder to generate processed dataset in.")
  parser.add_argument("--feature-endpoint", type=str,
                      help="Optional endpoint that holds pre-computed feature vector")
  parser.add_argument("--prediction-endpoint", type=str, required=1,
                      help="Name of measured endpoint to predict.")
  parser.add_argument("--threshold", type=float, default=None,
                      help="Used to turn real-valued data into binary.")
  parser.add_argument("--delimiter", default="\t",
                      help="Delimiter in csv file")
  parser.add_argument("--has-colnames", type=bool, default=False,
                      help="Input has column names.")
  parser.add_argument("--split-endpoint", type=str, default=None,
                      help="User-specified train-test split.")
  return parser.parse_args(input_args)

def generate_directories(name, out):
def generate_directories(name, out, feature_endpoint):
  """Generate processed dataset."""
  dataset_dir = os.path.join(out, name)
  if not os.path.exists(dataset_dir):
@@ -57,11 +63,16 @@ def generate_directories(name, out):
  shards_dir = os.path.join(dataset_dir, "shards")
  if not os.path.exists(shards_dir):
    os.makedirs(shards_dir)
  if feature_endpoint is not None:
    feature_endpoint_dir = os.path.join(dataset_dir, feature_endpoint)
    if not os.path.exists(feature_endpoint_dir):
      os.makedirs(feature_endpoint_dir)

  # Return names of files to be generated
  out_pkl = os.path.join(target_dir, "%s.pkl.gz" % name)
  out_y_pkl = os.path.join(target_dir, "%s.pkl.gz" % name)
  out_sdf = os.path.join(shards_dir, "%s-0.sdf.gz" % name)
  return out_pkl, out_sdf
  out_x_pkl = os.path.join(feature_endpoint_dir, "%s.pkl.gz" %name) if feature_endpoint is not None else None
  return out_x_pkl, out_y_pkl, out_sdf

def parse_float_input(val):
  """Safely parses a float input."""
@@ -167,20 +178,63 @@ def process_field(data, field_type):
  elif field_type == "float":
    return parse_float_input(data)
  elif field_type == "list-string":
    if type(data) == list:
      return data
    else:
      return data.split(",")
  elif field_type == "list-float":
    return np.array(data.split(","))
  elif field_type == "ndarray":
    return data 

def generate_targets(input_file, input_type, fields, field_types, out_pkl,
    out_sdf, prediction_endpoint, threshold, delimiter):
  """Process input data file."""
def generate_targets(df, mols, prediction_endpoint, split_endpoint, out_pkl, out_sdf):
  """Process input data file, generate labels, i.e. y"""
  # TODO(enf, rbharath): Modify package unique identifier to take a user-specified
  # unique identifier instead of assuming the smiles string.
  if split_endpoint is not None:
    labels_df = df[["smiles", prediction_endpoint, split_endpoint]]
  else:
    labels_df = df[["smiles", prediction_endpoint]]

  # Write pkl.gz file
  with gzip.open(out_pkl, "wb") as f:
    pickle.dump(labels_df, f, pickle.HIGHEST_PROTOCOL)
  # Write sdf.gz file
  with gzip.open(out_sdf, "wb") as gz:
    w = Chem.SDWriter(gz)
    for mol in mols:
      w.write(mol)
    w.close()

def generate_scaffold(smiles_elt, include_chirality=False):
  smiles_string = smiles_elt["smiles"]
  mol = Chem.MolFromSmiles(smiles_string)
  engine = ScaffoldGenerator(include_chirality=include_chirality)
  scaffold = engine.get_scaffold(mol)
  return(scaffold)

def generate_features(df, feature_endpoint, out_pkl):
  if feature_endpoint is None:
    print("No feature endpoint specified by user.")
    return

  features_df = df[["smiles"]]
  features_df["features"] = df[[feature_endpoint]]
  features_df["scaffolds"] = df[["smiles"]].apply(generate_scaffold, axis=1)
  features_df["mol_id"] = df[["smiles"]].apply(lambda s : "", axis=1)

  with gzip.open(out_pkl, "wb") as f:
    pickle.dump(features_df, f, pickle.HIGHEST_PROTOCOL)

def extract_data(input_file, input_type, fields, field_types, 
      prediction_endpoint, threshold, delimiter, has_colnames):
  """Extracts data from input as Pandas data frame"""

  rows, mols, smiles = [], [], SmilesGenerator()
  for row_index, raw_row in enumerate(get_rows(input_file, input_type, delimiter)):
    print row_index
    # Skip row labels.
    if row_index == 0 or raw_row is None:  
    # Skip row labels if necessary.
    if has_colnames and (row_index == 0 or raw_row is None):  
      continue
    row, row_data = {}, get_row_data(raw_row, input_type, fields, field_types)
    for ind, (field, field_type) in enumerate(zip(fields, field_types)):
@@ -189,35 +243,26 @@ def generate_targets(input_file, input_type, fields, field_types, out_pkl,
        row[field] = 1 if raw_val > threshold else 0 
      else:
        row[field] = process_field(row_data[ind], field_type)
    # TODO(rbharath): This patch is only in place until the smiles/sequence
    # support is fixed.
    if row["smiles"] is None:
      # This multiplication kludge guarantees unique smiles.
      mol = Chem.MolFromSmiles("C"*row_index)
    else:
    
    mol = Chem.MolFromSmiles(row["smiles"])
    row["smiles"] = smiles.get_smiles(mol)
    mols.append(mol)
    rows.append(row)
  df = pd.DataFrame(rows)
  # Write pkl.gz file
  with gzip.open(out_pkl, "wb") as f:
    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
  # Write sdf.gz file
  with gzip.open(out_sdf, "wb") as gz:
    w = Chem.SDWriter(gz)
    for mol in mols:
      w.write(mol)
    w.close()
  return(df, mols)


def main():
  args = parse_args()
  if len(args.fields) != len(args.field_types):
    raise ValueError("number of fields does not equal number of field types")
  out_pkl, out_sdf = generate_directories(args.name, args.out)
  generate_targets(args.input_file, args.input_type, args.fields,
      args.field_types, out_pkl, out_sdf, args.prediction_endpoint,
      args.threshold, args.delimiter)
  out_x_pkl, out_y_pkl, out_sdf = generate_directories(args.name, args.out, 
      args.feature_endpoint)
  df, mols = extract_data(args.input_file, args.input_type, args.fields,
      args.field_types, args.prediction_endpoint,
      args.threshold, args.delimiter, args.has_colnames)
  generate_targets(df, mols, args.prediction_endpoint, args.split_endpoint, out_y_pkl, out_sdf)
  generate_features(df, args.feature_endpoint, out_x_pkl)
  generate_fingerprints(args.name, args.out)
  generate_descriptors(args.name, args.out)

+1 −49
@@ -16,6 +16,7 @@ from deep_chem.utils.preprocess import tensor_dataset_to_numpy
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.preprocess import to_arrays
from vs_utils.utils import ScaffoldGenerator

def process_datasets(paths, input_transforms, output_transforms,
    prediction_endpoint=None, split_endpoint=None, datatype="vector",
@@ -107,29 +108,6 @@ def load_molecules(paths, feature_types=["fingerprints"]):
              entry["feature_types"].append(feature_type)
  return molecules 

def load_pdbbind_molecules(paths, dir_name="fingerprints"):
  """Load dataset fingerprints and return fingerprints.
  """
  # TODO(rbharath): This is a total kludge. Clean up later.
  dir_name = "targets"
  molecules = {}
  for dataset_path in paths:
    pickle_dir = os.path.join(dataset_path, dir_name)
    pickle_files = os.listdir(pickle_dir)
    if len(pickle_files) == 0:
      raise ValueError("No Pickle Files found to load molecules")
    for pickle_file in pickle_files:
      with gzip.open(os.path.join(pickle_dir, pickle_file), "rb") as f:
        contents = pickle.load(f)
        smiles, fingerprints, scaffolds, mol_ids = (
            contents["smiles"], contents["features"],
            None, None)
        for mol in range(len(contents["smiles"])):
          molecules[smiles[mol]] = {"fingerprint": fingerprints[mol],
                                    "scaffold": None,
                                    "mol_id": None}
  return molecules 

def get_target_names(paths, target_dir_name="targets"):
  """Get names of targets in provided collections.

@@ -207,32 +185,6 @@ def load_datasets(paths, prediction_endpoint, split_endpoint, datatype="vs",
  else:
    raise ValueError("Unsupported datatype.")

def load_pdbbind_datasets(paths, prediction_endpoint, target_dir_name="targets",
    feature_types=["grid"]):
  """Load pdbbind datasets.

  TODO(rbharath): This uses smiles as unique identifier. FIX BEFORE RELEASE!

  Parameters
  ----------
  pdbbind_path: list 
    List of Pdbbind data files.
  """
  data = {}
  if feature_types != ["grid"]:
    raise ValueError("Only grid features are supported for PDB-Bind data.")
  molecules = load_pdbbind_molecules(paths)
  labels, _ = load_assays(paths, prediction_endpoint, target_dir_name)
  # TODO(rbharath): Why are there fewer descriptors than labels at times?
  # What accounts for the discrepancy? Please investigate.
  for ind, smiles in enumerate(molecules):
    if smiles not in labels:
      continue
    mol = molecules[smiles]
    data[ind] = {"fingerprint": mol["fingerprint"],
                 "scaffold": mol["scaffold"],
                 "labels": labels[smiles]}
  return data

def load_vs_datasets(paths, prediction_endpoint, split_endpoint, target_dir_name="targets",
    feature_types=["fingerprints"]):
+6 −6
@@ -164,9 +164,9 @@ def tensor_dataset_to_numpy(dataset, feature_endpoint="fingerprint",
    fingerprint, labels = (datapoint[feature_endpoint],
      datapoint[labels_endpoint])
    X[index] = fingerprint
    # TODO(rbharath): The label is a dict for some reason?!? Figure this out
    # and fix it.
    y[index] = labels["3d_core_pdbbind"]
    # TODO(rbharath): This is only specialized to single task.
    # need to generalize to handle multi-task
    y[index] = labels[labels.keys()[0]]
  return (X, y, W)

def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
@@ -190,7 +190,7 @@ def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
  """
  n_samples = len(dataset.keys())
  sample_datapoint = dataset.itervalues().next()
  n_features = len(sample_datapoint[feature_endpoint])
  n_features = np.size(sample_datapoint[feature_endpoint])
  n_targets = len(sample_datapoint[labels_endpoint])
  X = np.zeros((n_samples, n_features))
  y = np.zeros((n_samples, n_targets))
@@ -200,7 +200,7 @@ def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
    datapoint = dataset[smiles] 
    fingerprint, labels  = (datapoint[feature_endpoint],
        datapoint[labels_endpoint])
    X[index] = np.array(fingerprint)
    X[index] = np.array(fingerprint).flatten()
    sorted_targets = sorted(labels.keys())
    # Set labels from measurements
    for t_ind, target in enumerate(sorted_targets):
@@ -243,7 +243,7 @@ def multitask_to_singletask(dataset):
        singletask[target][smiles] = datapoint_copy 
  return singletask

def split_dataset(dataset, splittype):
def split_dataset(dataset, splittype, seed=None):
  """Split provided data using specified method."""
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)