Commit 913f3c03 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

More pylint for featurize

parent c2c07365
Loading
Loading
Loading
Loading
+1 −1
Original line number Original line Diff line number Diff line
@@ -370,7 +370,7 @@ def _featurize_input(name, out, input_file, input_type, fields, field_types,
      input_file, input_type, fields, field_types, prediction_field,
      input_file, input_type, fields, field_types, prediction_field,
      smiles_field, threshold, delimiter)
      smiles_field, threshold, delimiter)
  print "Generating targets"
  print "Generating targets"
  generate_targets(df, mols, prediction_field, split_field,
  generate_targets(df, prediction_field, split_field,
                   smiles_field, id_field, out_y_pkl)
                   smiles_field, id_field, out_y_pkl)
  print "Generating user-specified features"
  print "Generating user-specified features"
  generate_features(df, feature_fields, smiles_field, id_field, out_x_pkl)
  generate_features(df, feature_fields, smiles_field, id_field, out_x_pkl)
+60 −61
Original line number Original line Diff line number Diff line
@@ -5,14 +5,11 @@ import os
import cPickle as pickle
import cPickle as pickle
import gzip
import gzip
import functools
import functools
import itertools
import pandas as pd
import pandas as pd
import openpyxl as px
import openpyxl as px
import numpy as np
import numpy as np
import argparse
import csv
import csv
from rdkit import Chem
from rdkit import Chem
import subprocess
from vs_utils.utils import SmilesGenerator, ScaffoldGenerator
from vs_utils.utils import SmilesGenerator, ScaffoldGenerator
from vs_utils.features.fingerprints import CircularFingerprint
from vs_utils.features.fingerprints import CircularFingerprint
from vs_utils.features.basic import SimpleDescriptors
from vs_utils.features.basic import SimpleDescriptors
@@ -57,22 +54,21 @@ def parse_float_input(val):
    if ">" in val or "<" in val or "-" in val:
    if ">" in val or "<" in val or "-" in val:
      return np.nan
      return np.nan


def generate_vs_utils_features(df, name, out, smiles_field, id_field, featuretype):
def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field, featuretype):
  """Generates circular fingerprints for dataset."""
  """Generates circular fingerprints for dataset."""
  dataset_dir = os.path.join(out, name)
  dataset_dir = os.path.join(out, name)
  feature_dir = os.path.join(dataset_dir, featuretype)
  feature_dir = os.path.join(dataset_dir, featuretype)
  features = os.path.join(feature_dir,
  features = os.path.join(feature_dir, "%s-%s.pkl.gz" % (name, featuretype))
      "%s-%s.pkl.gz" % (name, featuretype))


  feature_df = pd.DataFrame([])
  feature_df = pd.DataFrame([])
  feature_df["smiles"] = df[[smiles_field]]
  feature_df["smiles"] = dataframe[[smiles_field]]
  feature_df["scaffolds"] = df[[smiles_field]].apply(
  feature_df["scaffolds"] = dataframe[[smiles_field]].apply(
      functools.partial(generate_scaffold, smiles_field=smiles_field),
      functools.partial(generate_scaffold, smiles_field=smiles_field),
      axis=1)
      axis=1)
  feature_df["mol_id"] = df[[id_field]]
  feature_df["mol_id"] = dataframe[[id_field]]


  mols = []
  mols = []
  for row in df.iterrows():
  for row in dataframe.iterrows():
    # pandas rows are tuples (row_num, row_data)
    # pandas rows are tuples (row_num, row_data)
    smiles = row[1][smiles_field]
    smiles = row[1][smiles_field]
    mols.append(Chem.MolFromSmiles(smiles))
    mols.append(Chem.MolFromSmiles(smiles))
@@ -85,8 +81,8 @@ def generate_vs_utils_features(df, name, out, smiles_field, id_field, featuretyp
  feature_df["features"] = pd.DataFrame(
  feature_df["features"] = pd.DataFrame(
      [{"features": feature} for feature in featurizer.featurize(mols)])
      [{"features": feature} for feature in featurizer.featurize(mols)])


  with gzip.open(features, "wb") as f:
  with gzip.open(features, "wb") as gzip_file:
    pickle.dump(feature_df, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(feature_df, gzip_file, pickle.HIGHEST_PROTOCOL)


def get_rows(input_file, input_type, delimiter):
def get_rows(input_file, input_type, delimiter):
  """Returns an iterator over all rows in input_file"""
  """Returns an iterator over all rows in input_file"""
@@ -94,27 +90,28 @@ def get_rows(input_file, input_type, delimiter):
  # right option here might be to create a class which internally handles data
  # right option here might be to create a class which internally handles data
  # loading.
  # loading.
  if input_type == "xlsx":
  if input_type == "xlsx":
    W = px.load_workbook(input_file, use_iterators=True)
    workbook = px.load_workbook(input_file, use_iterators=True)
    sheet_names = W.get_sheet_names()
    sheet_names = workbook.get_sheet_names()
    p = W.get_sheet_by_name(name=sheet_names[0])    # Take first sheet as the active sheet
    # Take first sheet as the active sheet
    return p.iter_rows()
    sheet = workbook.get_sheet_by_name(name=sheet_names[0])
    return sheet.iter_rows()
  elif input_type == "csv":
  elif input_type == "csv":
    with open(input_file, "rb") as f:
    with open(input_file, "rb") as inp_file_obj:
      reader = csv.reader(f, delimiter=delimiter)
      reader = csv.reader(inp_file_obj, delimiter=delimiter)
      return [row for row in reader]
      return [row for row in reader]
  elif input_type == "pandas":
  elif input_type == "pandas":
    with gzip.open(input_file) as f:
    with gzip.open(input_file) as inp_file_obj:
      df = pickle.load(f)
      dataframe = pickle.load(inp_file_obj)
    return df.iterrows()
    return dataframe.iterrows()
  elif input_type == "sdf":
  elif input_type == "sdf":
    if ".gz" in input_file:
    if ".gz" in input_file:
      with gzip.open(input_file) as f:
      with gzip.open(input_file) as inp_file_obj:
        supp = Chem.ForwardSDMolSupplier(f)
        supp = Chem.ForwardSDMolSupplier(inp_file_obj)
        mols = [mol for mol in supp if mol is not None]
        mols = [mol for mol in supp if mol is not None]
      return mols
      return mols
    else:
    else:
      with open(input_file) as f:
      with open(input_file) as inp_file_obj:
        supp  = Chem.ForwardSDMolSupplier(f)
        supp = Chem.ForwardSDMolSupplier(inp_file_obj)
        mols = [mol for mol in supp if mol is not None]
        mols = [mol for mol in supp if mol is not None]
      return mols
      return mols


@@ -158,7 +155,7 @@ def process_field(data, field_type):
  elif field_type == "float":
  elif field_type == "float":
    return parse_float_input(data)
    return parse_float_input(data)
  elif field_type == "list-string":
  elif field_type == "list-string":
    if type(data) == list:
    if isinstance(data, list):
      return data
      return data
    else:
    else:
      return data.split(",")
      return data.split(",")
@@ -167,43 +164,45 @@ def process_field(data, field_type):
  elif field_type == "ndarray":
  elif field_type == "ndarray":
    return data
    return data


def generate_targets(dataframe, prediction_field, split_field,
                     smiles_field, id_field, out_pkl):
  """Build the labels frame (y) for a dataset and write it as pkl.gz.

  Copies the id, smiles, prediction (and optional split) columns out of
  `dataframe` into a fresh DataFrame under standardized column names, then
  pickles that frame gzip-compressed to `out_pkl`.
  """
  #TODO(enf, rbharath): Modify package unique identifier to take user-specified
  #unique identifier instead of assuming smiles string
  # Map standardized output column name -> source column in the input frame.
  column_sources = [("mol_id", id_field),
                    ("smiles", smiles_field),
                    ("prediction", prediction_field)]
  if split_field is not None:
    column_sources.append(("split", split_field))

  labels_df = pd.DataFrame([])
  for label_col, source_col in column_sources:
    labels_df[label_col] = dataframe[[source_col]]

  # Serialize gzip-compressed pickle to disk.
  with gzip.open(out_pkl, "wb") as out_file:
    pickle.dump(labels_df, out_file, pickle.HIGHEST_PROTOCOL)


def generate_scaffold(smiles_elt, include_chirality=False, smiles_field="smiles"):
  """Compute the Bemis-Murcko scaffold for a SMILES string.

  `smiles_elt` is a row-like mapping; the SMILES is read from its
  `smiles_field` entry, parsed with RDKit, and reduced to its scaffold.
  """
  # Parse the molecule, then delegate scaffold extraction to the shared
  # ScaffoldGenerator engine.
  mol = Chem.MolFromSmiles(smiles_elt[smiles_field])
  return ScaffoldGenerator(include_chirality=include_chirality).get_scaffold(mol)


def generate_features(df, feature_fields, smiles_field, id_field, out_pkl):
def generate_features(dataframe, feature_fields, smiles_field, id_field, out_pkl):
  """Puts user defined features into a standard directory structure."""
  if feature_fields is None:
  if feature_fields is None:
    print("No feature field specified by user.")
    print("No feature field specified by user.")
    return
    return


  features_df = pd.DataFrame([])
  features_df = pd.DataFrame([])
  features_df["smiles"] = df[[smiles_field]]
  features_df["smiles"] = dataframe[[smiles_field]]
  features_df["scaffolds"] = df[[smiles_field]].apply(
  features_df["scaffolds"] = dataframe[[smiles_field]].apply(
      functools.partial(generate_scaffold, smiles_field=smiles_field),
      functools.partial(generate_scaffold, smiles_field=smiles_field),
      axis=1)
      axis=1)
  features_df["mol_id"] = df[[id_field]]
  features_df["mol_id"] = dataframe[[id_field]]


  features_data = []
  features_data = []
  for row in df.iterrows():
  for row in dataframe.iterrows():
    # pandas rows are tuples (row_num, row_data)
    # pandas rows are tuples (row_num, row_data)
    row, feature_list = row[1], []
    row, feature_list = row[1], []
    for feature in feature_fields:
    for feature in feature_fields:
@@ -211,8 +210,8 @@ def generate_features(df, feature_fields, smiles_field, id_field, out_pkl):
    features_data.append({"row": np.array(feature_list)})
    features_data.append({"row": np.array(feature_list)})
  features_df["features"] = pd.DataFrame(features_data)
  features_df["features"] = pd.DataFrame(features_data)


  with gzip.open(out_pkl, "wb") as f:
  with gzip.open(out_pkl, "wb") as pickle_file:
    pickle.dump(features_df, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(features_df, pickle_file, pickle.HIGHEST_PROTOCOL)


def extract_data(input_file, input_type, fields, field_types,
def extract_data(input_file, input_type, fields, field_types,
                 prediction_field, smiles_field, threshold, delimiter):
                 prediction_field, smiles_field, threshold, delimiter):
@@ -231,7 +230,7 @@ def extract_data(input_file, input_type, fields, field_types,
      colnames = get_colnames(raw_row, input_type)
      colnames = get_colnames(raw_row, input_type)
      continue
      continue
    row, row_data = {}, get_row_data(raw_row, input_type, fields, smiles_field, colnames)
    row, row_data = {}, get_row_data(raw_row, input_type, fields, smiles_field, colnames)
    for ind, (field, field_type) in enumerate(zip(fields, field_types)):
    for (field, field_type) in zip(fields, field_types):
      if field == prediction_field and threshold is not None:
      if field == prediction_field and threshold is not None:
        raw_val = process_field(row_data[field], field_type)
        raw_val = process_field(row_data[field], field_type)
        row[field] = 1 if raw_val > threshold else 0
        row[field] = 1 if raw_val > threshold else 0
@@ -241,5 +240,5 @@ def extract_data(input_file, input_type, fields, field_types,
    row["smiles"] = smiles.get_smiles(mol)
    row["smiles"] = smiles.get_smiles(mol)
    mols.append(mol)
    mols.append(mol)
    rows.append(row)
    rows.append(row)
  df = pd.DataFrame(rows)
  dataframe = pd.DataFrame(rows)
  return(df, mols)
  return(dataframe, mols)