Commit 913f3c03 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

More pylint for featurize

parent c2c07365
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -370,7 +370,7 @@ def _featurize_input(name, out, input_file, input_type, fields, field_types,
      input_file, input_type, fields, field_types, prediction_field,
      smiles_field, threshold, delimiter)
  print "Generating targets"
  generate_targets(df, mols, prediction_field, split_field,
  generate_targets(df, prediction_field, split_field,
                   smiles_field, id_field, out_y_pkl)
  print "Generating user-specified features"
  generate_features(df, feature_fields, smiles_field, id_field, out_x_pkl)
+60 −61
Original line number Diff line number Diff line
@@ -5,14 +5,11 @@ import os
import cPickle as pickle
import gzip
import functools
import itertools
import pandas as pd
import openpyxl as px
import numpy as np
import argparse
import csv
from rdkit import Chem
import subprocess
from vs_utils.utils import SmilesGenerator, ScaffoldGenerator
from vs_utils.features.fingerprints import CircularFingerprint
from vs_utils.features.basic import SimpleDescriptors
@@ -57,22 +54,21 @@ def parse_float_input(val):
    if ">" in val or "<" in val or "-" in val:
      return np.nan

def generate_vs_utils_features(df, name, out, smiles_field, id_field, featuretype):
def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field, featuretype):
  """Generates circular fingerprints for dataset."""
  dataset_dir = os.path.join(out, name)
  feature_dir = os.path.join(dataset_dir, featuretype)
  features = os.path.join(feature_dir,
      "%s-%s.pkl.gz" % (name, featuretype))
  features = os.path.join(feature_dir, "%s-%s.pkl.gz" % (name, featuretype))

  feature_df = pd.DataFrame([])
  feature_df["smiles"] = df[[smiles_field]]
  feature_df["scaffolds"] = df[[smiles_field]].apply(
  feature_df["smiles"] = dataframe[[smiles_field]]
  feature_df["scaffolds"] = dataframe[[smiles_field]].apply(
      functools.partial(generate_scaffold, smiles_field=smiles_field),
      axis=1)
  feature_df["mol_id"] = df[[id_field]]
  feature_df["mol_id"] = dataframe[[id_field]]

  mols = []
  for row in df.iterrows():
  for row in dataframe.iterrows():
    # pandas rows are tuples (row_num, row_data)
    smiles = row[1][smiles_field]
    mols.append(Chem.MolFromSmiles(smiles))
@@ -85,8 +81,8 @@ def generate_vs_utils_features(df, name, out, smiles_field, id_field, featuretyp
  feature_df["features"] = pd.DataFrame(
      [{"features": feature} for feature in featurizer.featurize(mols)])

  with gzip.open(features, "wb") as f:
    pickle.dump(feature_df, f, pickle.HIGHEST_PROTOCOL)
  with gzip.open(features, "wb") as gzip_file:
    pickle.dump(feature_df, gzip_file, pickle.HIGHEST_PROTOCOL)

def get_rows(input_file, input_type, delimiter):
  """Returns an iterator over all rows in input_file"""
@@ -94,27 +90,28 @@ def get_rows(input_file, input_type, delimiter):
  # right option here might be to create a class which internally handles data
  # loading.
  if input_type == "xlsx":
    W = px.load_workbook(input_file, use_iterators=True)
    sheet_names = W.get_sheet_names()
    p = W.get_sheet_by_name(name=sheet_names[0])    # Take first sheet as the active sheet
    return p.iter_rows()
    workbook = px.load_workbook(input_file, use_iterators=True)
    sheet_names = workbook.get_sheet_names()
    # Take first sheet as the active sheet
    sheet = workbook.get_sheet_by_name(name=sheet_names[0])
    return sheet.iter_rows()
  elif input_type == "csv":
    with open(input_file, "rb") as f:
      reader = csv.reader(f, delimiter=delimiter)
    with open(input_file, "rb") as inp_file_obj:
      reader = csv.reader(inp_file_obj, delimiter=delimiter)
      return [row for row in reader]
  elif input_type == "pandas":
    with gzip.open(input_file) as f:
      df = pickle.load(f)
    return df.iterrows()
    with gzip.open(input_file) as inp_file_obj:
      dataframe = pickle.load(inp_file_obj)
    return dataframe.iterrows()
  elif input_type == "sdf":
    if ".gz" in input_file:
      with gzip.open(input_file) as f:
        supp = Chem.ForwardSDMolSupplier(f)
      with gzip.open(input_file) as inp_file_obj:
        supp = Chem.ForwardSDMolSupplier(inp_file_obj)
        mols = [mol for mol in supp if mol is not None]
      return mols
    else:
      with open(input_file) as f:
        supp  = Chem.ForwardSDMolSupplier(f)
      with open(input_file) as inp_file_obj:
        supp = Chem.ForwardSDMolSupplier(inp_file_obj)
        mols = [mol for mol in supp if mol is not None]
      return mols

@@ -158,7 +155,7 @@ def process_field(data, field_type):
  elif field_type == "float":
    return parse_float_input(data)
  elif field_type == "list-string":
    if type(data) == list:
    if isinstance(data, list):
      return data
    else:
      return data.split(",")
@@ -167,43 +164,45 @@ def process_field(data, field_type):
  elif field_type == "ndarray":
    return data

def generate_targets(df, mols, prediction_field, split_field,
def generate_targets(dataframe, prediction_field, split_field,
                     smiles_field, id_field, out_pkl):
  """Process input data file, generate labels, i.e. y"""
  # NOTE(review): this span is a rendered diff, so every changed statement
  # appears twice: the pre-change line (using `df` / `f`) immediately followed
  # by its post-change replacement (using `dataframe` / `pickle_file`). The
  # change also drops the `mols` parameter, which the body never references.
  #
  # Builds a labels frame whose "mol_id", "smiles" and "prediction" columns are
  # taken from the id_field, smiles_field and prediction_field columns of the
  # input frame, then pickles that frame into the gzip file at out_pkl.
  #TODO(enf, rbharath): Modify package unique identifier to take user-specified
    #unique identifier instead of assuming smiles string
  labels_df = pd.DataFrame([])
  labels_df["mol_id"] = df[[id_field]]
  labels_df["smiles"] = df[[smiles_field]]
  labels_df["prediction"] = df[[prediction_field]]
  labels_df["mol_id"] = dataframe[[id_field]]
  labels_df["smiles"] = dataframe[[smiles_field]]
  labels_df["prediction"] = dataframe[[prediction_field]]
  # The "split" column is only written when the caller names a split column.
  if split_field is not None:
    labels_df["split"] = df[[split_field]]
    labels_df["split"] = dataframe[[split_field]]

  # Write pkl.gz file
  with gzip.open(out_pkl, "wb") as f:
    pickle.dump(labels_df, f, pickle.HIGHEST_PROTOCOL)
  with gzip.open(out_pkl, "wb") as pickle_file:
    pickle.dump(labels_df, pickle_file, pickle.HIGHEST_PROTOCOL)

def generate_scaffold(smiles_elt, include_chirality=False, smiles_field="smiles"):
  """Compute the Bemis-Murcko scaffold for a SMILES string.

  Args:
    smiles_elt: Mapping-like record indexed by `smiles_field` to obtain the
      SMILES string. Elsewhere in this file it is invoked through
      DataFrame.apply(..., axis=1), i.e. once per row.
    include_chirality: Forwarded unchanged to ScaffoldGenerator.
    smiles_field: Key under which the SMILES string is stored in `smiles_elt`.

  Returns:
    Whatever ScaffoldGenerator.get_scaffold returns for the parsed molecule.
  """
  smiles_string = smiles_elt[smiles_field]
  # NOTE(review): the result of MolFromSmiles is passed on without a check;
  # confirm how get_scaffold behaves if the SMILES string fails to parse.
  mol = Chem.MolFromSmiles(smiles_string)
  engine = ScaffoldGenerator(include_chirality=include_chirality)
  scaffold = engine.get_scaffold(mol)
  # The next two lines are the pre-change and post-change forms of the same
  # return statement as rendered by the diff; only the parentheses differ.
  return(scaffold)
  return scaffold

def generate_features(df, feature_fields, smiles_field, id_field, out_pkl):
def generate_features(dataframe, feature_fields, smiles_field, id_field, out_pkl):
  """Puts user defined features into a standard directory structure."""
  if feature_fields is None:
    print("No feature field specified by user.")
    return

  features_df = pd.DataFrame([])
  features_df["smiles"] = df[[smiles_field]]
  features_df["scaffolds"] = df[[smiles_field]].apply(
  features_df["smiles"] = dataframe[[smiles_field]]
  features_df["scaffolds"] = dataframe[[smiles_field]].apply(
      functools.partial(generate_scaffold, smiles_field=smiles_field),
      axis=1)
  features_df["mol_id"] = df[[id_field]]
  features_df["mol_id"] = dataframe[[id_field]]

  features_data = []
  for row in df.iterrows():
  for row in dataframe.iterrows():
    # pandas rows are tuples (row_num, row_data)
    row, feature_list = row[1], []
    for feature in feature_fields:
@@ -211,8 +210,8 @@ def generate_features(df, feature_fields, smiles_field, id_field, out_pkl):
    features_data.append({"row": np.array(feature_list)})
  features_df["features"] = pd.DataFrame(features_data)

  with gzip.open(out_pkl, "wb") as f:
    pickle.dump(features_df, f, pickle.HIGHEST_PROTOCOL)
  with gzip.open(out_pkl, "wb") as pickle_file:
    pickle.dump(features_df, pickle_file, pickle.HIGHEST_PROTOCOL)

def extract_data(input_file, input_type, fields, field_types,
                 prediction_field, smiles_field, threshold, delimiter):
@@ -231,7 +230,7 @@ def extract_data(input_file, input_type, fields, field_types,
      colnames = get_colnames(raw_row, input_type)
      continue
    row, row_data = {}, get_row_data(raw_row, input_type, fields, smiles_field, colnames)
    for ind, (field, field_type) in enumerate(zip(fields, field_types)):
    for (field, field_type) in zip(fields, field_types):
      if field == prediction_field and threshold is not None:
        raw_val = process_field(row_data[field], field_type)
        row[field] = 1 if raw_val > threshold else 0
@@ -241,5 +240,5 @@ def extract_data(input_file, input_type, fields, field_types,
    row["smiles"] = smiles.get_smiles(mol)
    mols.append(mol)
    rows.append(row)
  df = pd.DataFrame(rows)
  return(df, mols)
  dataframe = pd.DataFrame(rows)
  return(dataframe, mols)