Commit b1011fe3 authored by Bharath Ramsundar
Browse files

Local changes.

parent 4b4a572d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -47,6 +47,7 @@ def process_multitask(paths, task_transforms, desc_transforms, splittype="random
  """
  dataset = load_and_transform_dataset(paths, task_transforms, desc_transforms,
      add_descriptors=add_descriptors, weight_positives=weight_positives)
  sorted_targets = sorted(dataset.keys())
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
  elif splittype == "scaffold":
+24 −22
Original line number Diff line number Diff line
@@ -15,8 +15,8 @@ from vs_utils.utils import SmilesGenerator
def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--xlsx', required=1,
                      help='Excel file with Globavir data.')
  parser.add_argument('--data', required=1,
                      help='Input file with data.')
  parser.add_argument("--name", required=1,
                      help="Name of the dataset.")
  parser.add_argument("--out", required=1,
@@ -28,7 +28,7 @@ def generate_directories(name, out):
  dataset_dir = os.path.join(out, name)
  if not os.path.exists(dataset_dir):
    os.makedirs(dataset_dir)
  fingerprint_dir = os.path.join(dataset_dir, "circular-scaffold-smiles")
  fingerprint_dir = os.path.join(dataset_dir, "fingerprints")
  if not os.path.exists(fingerprint_dir):
    os.makedirs(fingerprint_dir)
  target_dir = os.path.join(dataset_dir, "targets")
@@ -58,19 +58,25 @@ def parse_float_input(val):

def generate_fingerprints(name, out):
  """Featurize the dataset's first SDF shard into circular fingerprints.

  Runs the `vs_utils.scripts.featurize` module in a subprocess, reading
  <out>/<name>/shards/<name>-0.sdf.gz and writing
  <out>/<name>/fingerprints/<name>-fingerprints.pkl.gz.

  Args:
    name: Dataset name; used as the directory name under `out` and as the
      prefix of the shard and fingerprint file names.
    out: Root output directory.
  """
  dataset_dir = os.path.join(out, name)
  # Must match the "fingerprints" directory created by generate_directories.
  fingerprint_dir = os.path.join(dataset_dir, "fingerprints")
  shards_dir = os.path.join(dataset_dir, "shards")
  sdf = os.path.join(shards_dir, "%s-0.sdf.gz" % name)
  fingerprints = os.path.join(fingerprint_dir,
      "%s-fingerprints.pkl.gz" % name)
  # The featurizer's exit status is not inspected here.
  subprocess.call(["python", "-m", "vs_utils.scripts.featurize",
                   "--scaffolds", "--smiles",
                   sdf, fingerprints,
                   "circular", "--size", "1024"])

def globavir_specs():
  """Return the column names and column types of the Globavir dataset.

  Returns:
    A (columns, column_types) tuple of two equal-length lists. Each entry of
    column_types is "string" or "float" and describes the column at the same
    index in columns.
  """
  # NOTE(review): generate_targets looks up row["smiles"], but the SMILES
  # column is named "isomeric_smiles" here -- confirm which key is intended.
  columns = ["compound_name", "isomeric_smiles", "tdo_ic50_nm", "tdo_Ki_nm",
    "tdo_percent_activity_10_um", "tdo_percent_activity_1_um", "ido_ic50_nm",
    "ido_Ki_nm", "ido_percent_activity_10_um", "ido_percent_activity_1_um"]
  column_types = ["string", "string", "float", "float", "float", "float",
      "float", "float", "float", "float"]
  # Without this return the function yielded None and the spec was unusable.
  return columns, column_types

def generate_targets(xlsx_file, out_pkl, out_sdf):
  """Process Globavir xlsx file."""
def generate_targets(xlsx_file, columns, column_types, out_pkl, out_sdf):
  """Process input data file."""
  rows, mols = [], []
  W = px.load_workbook(xlsx_file, use_iterators=True)
  p = W.get_sheet_by_name(name="Sheet1")
@@ -80,20 +86,16 @@ def generate_targets(xlsx_file, out_pkl, out_sdf):
    if row_index == 0:
      continue
    row_data = [cell.internal_value for cell in row]
    # TODO(rbharath): Generalize this code to work for non-Globavir data. 
    row = {
      "compound_name": row_data[0],
      "isomeric_smiles": row_data[1],
      "tdo_ic50_nm": parse_float_input(row_data[5]),
      "tdo_Ki_nm": parse_float_input(row_data[6]),
      "tdo_percent_activity_10_um": parse_float_input(row_data[7]),
      "tdo_percent_activity_1_um": parse_float_input(row_data[8]),
      "ido_ic50_nm": parse_float_input(row_data[9]),
      "ido_Ki_nm": parse_float_input(row_data[10]),
      "ido_percent_activity_10_um": parse_float_input(row_data[11]),
      "ido_percent_activity_1_um": parse_float_input(row_data[12])
    }
    mol = Chem.MolFromSmiles(row["isomeric_smiles"])
    row = {}
    for ind, (column, column_type) in enumerate(zip(columns, column_types)):
      if column_type == "string":
        row[column] = row_data[ind]
      elif column_type == "float":
        row[column] = parse_float_input(row_data[ind])
      elif column_type == "float-array" and ind = len(columns) - 1:
        row[column] = np.array(row_data[ind:])

    mol = Chem.MolFromSmiles(row["smiles"])
    row["smiles"] = smiles.get_smiles(mol)
    mols.append(mol)
    rows.append(row)
@@ -111,7 +113,7 @@ def generate_targets(xlsx_file, out_pkl, out_sdf):
def main():
  args = parse_args()
  out_pkl, out_sdf = generate_directories(args.name, args.out)
  generate_targets(args.xlsx, out_pkl, out_sdf)
  generate_targets(args.data, out_pkl, out_sdf)
  generate_fingerprints(args.name, args.out)


+10 −1
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ __copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

import numpy as np
import warnings
from deep_chem.utils.analysis import summarize_distribution

def get_default_descriptor_transforms():
@@ -97,6 +98,7 @@ def balance_positives(y, W):
  n_samples, n_targets = np.shape(y)
  for target_ind in range(n_targets):
    positive_inds, negative_inds = [], []
    to_next_target = False
    for sample_ind in range(n_samples):
      label = y[sample_ind, target_ind]
      if label == 1:
@@ -106,8 +108,15 @@ def balance_positives(y, W):
      elif label == -1:  # Case of missing label
        continue
      else:
        raise ValueError("Labels must be 0/1 or -1 (missing data) for balance_positives.")
        warnings.warn("Labels must be 0/1 or -1 " +
                      "(missing data) for balance_positives target %d. " % target_ind +
                      "Continuing without balancing.")
        to_next_target = True
        break 
    if to_next_target:
      continue
    n_positives, n_negatives = len(positive_inds), len(negative_inds)
    print "For target %d, n_positives: %d, n_negatives: %d" % (target_ind, n_positives, n_negatives)
    pos_weight = float(n_negatives)/float(n_positives)
    W[positive_inds, target_ind] = pos_weight
    W[negative_inds, target_ind] = 1