Commit 63ada38a authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Merge pull request #27 from rbharath/quick

Model Command Added
parents b5a1f89e 913f3c03
Loading
Loading
Loading
Loading
+18 −24
Original line number Diff line number Diff line
"""
Code for processing datasets using scikit-learn.
"""
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import MultiTaskLasso 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from sklearn.svm import SVR

def fit_singletask_models(train_data, modeltype, task_types):
def fit_singletask_models(train_data, modeltype):
  """Fits singletask linear regression models to potency.

  Parameters
@@ -26,23 +23,20 @@ def fit_singletask_models(train_data, modeltype, task_types):
    Type of split for train/test. Either random or scaffold.
  seed: int (optional)
    Seed to initialize np.random.
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  output_transforms: dict
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  """
  models = {}
  for index, target in enumerate(sorted(train_data.keys())):
    print "Building model %d" % index
    (_, X_train, y_train, W_train) = train_data[target]
  for target in sorted(train_data.keys()):
    print "Building model for target %s" % target
    (_, X_train, y_train, _) = train_data[target]
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(n_estimators=500, n_jobs=-1,
          warm_start=True, max_features="sqrt")
      model = RandomForestRegressor(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
    elif modeltype == "rf_classifier":
      model = RandomForestClassifier(n_estimators=500, n_jobs=-1,
          warm_start=True, max_features="sqrt")
      model = RandomForestClassifier(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
    elif modeltype == "logistic":
      model = LogisticRegression(class_weight="auto")
    elif modeltype == "linear":
@@ -63,11 +57,11 @@ def fit_singletask_models(train_data, modeltype, task_types):

# TODO(rbharath): I believe this is broken. Update it to work with the rest of
# the package.
def fit_multitask_rf(train_data):
  """Fits a multitask random-forest classifier to the provided dataset.

  Parameters
  ----------
  train_data: tuple
    Tuple (ids, X_train, y_train, weights); only X_train and y_train
    are used here.

  Returns
  -------
  A fitted sklearn RandomForestClassifier.
  """
  (_, X_train, y_train, _) = train_data
  # NOTE(review): class_weight="auto" is only accepted by older
  # scikit-learn releases (newer ones use "balanced") -- confirm the
  # sklearn version this package pins.
  model = RandomForestClassifier(
      n_estimators=100, n_jobs=-1, class_weight="auto")
  model.fit(X_train, y_train)
  return model
+447 −206

File changed.

Preview size limit exceeded, changes collapsed.

+64 −55
Original line number Diff line number Diff line
"""
Utility functions to evaluate models on datasets.
"""
from __future__ import print_function

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"
@@ -8,7 +10,7 @@ __license__ = "LGPL"
import csv
import numpy as np
import warnings
from deep_chem.utils.preprocess import dataset_to_numpy
import sys
from deep_chem.utils.preprocess import labels_to_weights
from deep_chem.utils.preprocess import undo_transform_outputs
from sklearn.metrics import mean_squared_error
@@ -17,21 +19,26 @@ from sklearn.metrics import r2_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt

def compute_model_performance(raw_test_data, test_data, task_types, models, modeltype,
    output_transforms, aucs=True, r2s=False, rms=False, recall=False, accuracy=False, mcc=False):
def compute_model_performance(raw_test_data, test_data, task_types, models,
                              modeltype, output_transforms, aucs=True,
                              r2s=False, rms=False, recall=False,
                              accuracy=False, mcc=False,
                              print_file=sys.stdout):
  """Computes statistics for model performance on test set."""
  all_results, auc_vals, r2_vals, rms_vals, mcc_vals, recall_vals, accuracy_vals = {}, {}, {}, {}, {}, {}, {}
  all_results = {}
  auc_vals, mcc_vals, recall_vals, accuracy_vals = {}, {}, {}, {}
  r2_vals, rms_vals = {}, {}
  for index, target in enumerate(sorted(test_data.keys())):
    print "Evaluating model %d" % index
    print "Target %s" % target
    (test_ids, Xtest, ytest, wtest) = test_data[target]
    print("Evaluating model %d" % index, file=print_file)
    print("Target %s" % target, file=print_file)
    (test_ids, X_test, y_test, w_test) = test_data[target]
    (_, _, ytest_raw, _) = raw_test_data[target]
    model = models[target]
    results = eval_model(test_ids, Xtest, ytest, ytest_raw, wtest, model, {target: task_types[target]}, 
                         modeltype=modeltype, output_transforms=output_transforms)
    results = eval_model(
        test_ids, X_test, y_test, ytest_raw, w_test, model,
        {target: task_types[target]}, modeltype=modeltype,
        output_transforms=output_transforms)
    all_results[target] = results[target]
    if aucs:
      auc_vals.update(compute_roc_auc_scores(results, task_types))
@@ -47,17 +54,17 @@ def compute_model_performance(raw_test_data, test_data, task_types, models, mode
      recall_vals.update(compute_accuracy_score(results, task_types))

  if aucs:
    print "Mean AUC: %f" % np.mean(np.array(auc_vals.values()))
    print("Mean AUC: %f" % np.mean(np.array(auc_vals.values())), file=print_file)
  if r2s:
    print "Mean R^2: %f" % np.mean(np.array(r2_vals.values()))
    print("Mean R^2: %f" % np.mean(np.array(r2_vals.values())), file=print_file)
  if rms:
    print "Mean RMS: %f" % np.mean(np.array(rms_vals.values()))
    print("Mean RMS: %f" % np.mean(np.array(rms_vals.values())), file=print_file)
  if mcc:
    print "Mean MCC: %f" % np.mean(np.array(mcc_vals.values()))
    print("Mean MCC: %f" % np.mean(np.array(mcc_vals.values())), file=print_file)
  if recall:
    print "Mean Recall: %f" % np.mean(np.array(recall_vals.values()))
    print("Mean Recall: %f" % np.mean(np.array(recall_vals.values())), file=print_file)
  if accuracy:
    print "Mean Accuracy: %f" % np.mean(np.array(accuracy_vals.values()))
    print("Mean Accuracy: %f" % np.mean(np.array(accuracy_vals.values())), file=print_file)

  return all_results, aucs, r2s, rms

@@ -88,7 +95,8 @@ def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
  # an upstream change so the evaluator doesn't have to worry about this.
  if len(np.shape(X)) > 2:  # Dealing with 3D data
    if len(np.shape(X)) != 5:
      raise ValueError("Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  if modeltype == "keras-graph":
@@ -107,13 +115,14 @@ def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
    ypreds = model.predict(X)
  else:
    raise ValueError("Improper modeltype.")
  if type(ypreds) == np.ndarray:
  if isinstance(ypreds, np.ndarray):
    ypreds = np.squeeze(ypreds)
  if type(ypreds) != list:
  if not isinstance(ypreds, list):
    ypreds = [ypreds]
  return ypreds

def eval_model(ids, X, Ytrue, Ytrue_raw, W, model, task_types, output_transforms, modeltype="sklearn"):
def eval_model(ids, X, Ytrue, Ytrue_raw, W, model, task_types,
               output_transforms, modeltype="sklearn"):
  """Evaluates the provided model on the test-set.

  Returns a dict which maps target-names to pairs of np.ndarrays (ytrue,
@@ -135,11 +144,11 @@ def eval_model(ids, X, Ytrue, Ytrue_raw, W, model, task_types, output_transforms
    Either sklearn, keras, or keras_multitask
  """
  sorted_targets = sorted(task_types.keys())
  ypreds = model_predictions(X, model, len(task_types),
      task_types, modeltype=modeltype)
  ypreds = model_predictions(
      X, model, len(task_types), task_types, modeltype=modeltype)
  results = {}
  for target_ind, target in enumerate(sorted_targets):
    ytrue_raw, ytrue, ypred = Ytrue_raw[:, target_ind], Ytrue[:, target_ind], ypreds[target_ind]
    ytrue_raw, _, ypred = Ytrue_raw[:, target_ind], Ytrue[:, target_ind], ypreds[target_ind]
    ypred = undo_transform_outputs(ytrue_raw, ypred, output_transforms)
    results[target] = (ids, np.squeeze(ytrue_raw), np.squeeze(ypred))
  return results
@@ -151,14 +160,14 @@ def results_to_csv(results, out, task_type="classification"):
    if task_type == "classification":
      yscores = np.around(yscores[:, 1]).astype(int)
    elif task_type == "regression":
      if type(yscores[0]) == np.ndarray:
      if isinstance(yscores[0], np.ndarray):
        yscores = yscores[:, 0]
    with open(out, "wb") as csvfile:
      csvwriter = csv.writer(csvfile, delimiter="\t")
      csvwriter.writerow(["Ids", "True", "Model-Prediction"])
      for id, ytrue, yscore in zip(mol_ids, ytrues, yscores):
        csvwriter.writerow([id, ytrue, yscore])
    print "Writing results on test set for target %s to %s" % (target, out)
      for mol_id, ytrue, yscore in zip(mol_ids, ytrues, yscores):
        csvwriter.writerow([mol_id, ytrue, yscore])
    print("Writing results on test set for target %s to %s" % (target, out))


def compute_r2_scores(results, task_types):
@@ -179,7 +188,7 @@ def compute_r2_scores(results, task_types):
      continue
    _, ytrue, yscore = results[target]
    score = r2_score(ytrue, yscore)
    print "Target %s: R^2 %f" % (target, score)
    print("Target %s: R^2 %f" % (target, score))
    scores[target] = score
  return scores

@@ -201,7 +210,7 @@ def compute_rms_scores(results, task_types):
      continue
    _, ytrue, yscore = results[target]
    rms = np.sqrt(mean_squared_error(ytrue, yscore))
    print "Target %s: RMS %f" % (target, rms)
    print("Target %s: RMS %f" % (target, rms))
    scores[target] = rms
  return scores

@@ -223,10 +232,10 @@ def compute_roc_auc_scores(results, task_types):
    sample_weights = labels_to_weights(ytrue)
    try:
      score = roc_auc_score(ytrue, yscore[:, 1], sample_weight=sample_weights)
    except Exception as e:
    except Exception:
      warnings.warn("ROC AUC score calculation failed.")
      score = 0.5
    print "Target %s: AUC %f" % (target, score)
    print("Target %s: AUC %f" % (target, score))
    scores[target] = score
  return scores

@@ -238,7 +247,7 @@ def compute_matthews_corr(results, task_types):
      continue
    _, ytrue, ypred = results[target]
    mcc = matthews_corrcoef(ytrue, np.around(ypred[:, 1]))
    print "Target %s: MCC %f" % (target, mcc)
    print("Target %s: MCC %f" % (target, mcc))
    scores[target] = mcc
  return scores

@@ -250,7 +259,7 @@ def compute_recall_score(results, task_types):
      continue
    _, ytrue, ypred = results[target]
    recall = recall_score(ytrue, np.around(ypred[:, 1]))
    print "Target %s: Recall %f" % (target, recall)
    print("Target %s: Recall %f" % (target, recall))
    scores[target] = recall
  return scores

@@ -262,6 +271,6 @@ def compute_accuracy_score(results, task_types):
      continue
    _, ytrue, ypred = results[target]
    accuracy = accuracy_score(ytrue, np.around(ypred[:, 1]))
    print "Target %s: Accuracy %f" % (target, accuracy)
    print("Target %s: Accuracy %f" % (target, accuracy))
    scores[target] = accuracy
  return scores
+79 −81
Original line number Diff line number Diff line
@@ -5,19 +5,16 @@ import os
import cPickle as pickle
import gzip
import functools
import itertools
import pandas as pd
import openpyxl as px
import numpy as np
import argparse
import csv
from rdkit import Chem
import subprocess
from vs_utils.utils import SmilesGenerator, ScaffoldGenerator
from vs_utils.features.fingerprints import CircularFingerprint
from vs_utils.features.basic import SimpleDescriptors

def generate_directories(name, out, feature_endpoints):
def generate_directories(name, out, feature_fields):
  """Generate directory structure for featurized dataset."""
  dataset_dir = os.path.join(out, name)
  if not os.path.exists(dataset_dir):
@@ -31,17 +28,17 @@ def generate_directories(name, out, feature_endpoints):
  target_dir = os.path.join(dataset_dir, "targets")
  if not os.path.exists(target_dir):
    os.makedirs(target_dir)
  if feature_endpoints is not None:
    feature_endpoint_dir = os.path.join(dataset_dir, "features")
    if not os.path.exists(feature_endpoint_dir):
      os.makedirs(feature_endpoint_dir)
  if feature_fields is not None:
    feature_field_dir = os.path.join(dataset_dir, "features")
    if not os.path.exists(feature_field_dir):
      os.makedirs(feature_field_dir)

  # Return names of files to be generated
  # TODO(rbharath): Explicitly passing around out_*_pkl is an encapsulation
  # failure. Remove this.
  out_y_pkl = os.path.join(target_dir, "%s.pkl.gz" % name)
  out_x_pkl = (os.path.join(feature_endpoint_dir, "%s-features.pkl.gz" %name)
      if feature_endpoints is not None else None)
  out_x_pkl = (os.path.join(feature_field_dir, "%s-features.pkl.gz" %name)
               if feature_fields is not None else None)
  return out_x_pkl, out_y_pkl

def parse_float_input(val):
@@ -57,24 +54,23 @@ def parse_float_input(val):
    if ">" in val or "<" in val or "-" in val:
      return np.nan

def generate_vs_utils_features(df, name, out, smiles_endpoint, id_endpoint, featuretype):
def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field, featuretype):
  """Generates circular fingerprints for dataset."""
  dataset_dir = os.path.join(out, name)
  feature_dir = os.path.join(dataset_dir, featuretype)
  features = os.path.join(feature_dir,
      "%s-%s.pkl.gz" % (name, featuretype))
  features = os.path.join(feature_dir, "%s-%s.pkl.gz" % (name, featuretype))

  feature_df = pd.DataFrame([])
  feature_df["smiles"] = df[[smiles_endpoint]]
  feature_df["scaffolds"] = df[[smiles_endpoint]].apply(
    functools.partial(generate_scaffold, smiles_endpoint=smiles_endpoint),
  feature_df["smiles"] = dataframe[[smiles_field]]
  feature_df["scaffolds"] = dataframe[[smiles_field]].apply(
      functools.partial(generate_scaffold, smiles_field=smiles_field),
      axis=1)
  feature_df["mol_id"] = df[[id_endpoint]]
  feature_df["mol_id"] = dataframe[[id_field]]

  mols = []
  for row in df.iterrows():
  for row in dataframe.iterrows():
    # pandas rows are tuples (row_num, row_data)
    smiles = row[1][smiles_endpoint]
    smiles = row[1][smiles_field]
    mols.append(Chem.MolFromSmiles(smiles))
  if featuretype == "fingerprints":
    featurizer = CircularFingerprint(size=1024)
@@ -85,8 +81,8 @@ def generate_vs_utils_features(df, name, out, smiles_endpoint, id_endpoint, feat
  feature_df["features"] = pd.DataFrame(
      [{"features": feature} for feature in featurizer.featurize(mols)])

  with gzip.open(features, "wb") as f:
    pickle.dump(feature_df, f, pickle.HIGHEST_PROTOCOL)
  with gzip.open(features, "wb") as gzip_file:
    pickle.dump(feature_df, gzip_file, pickle.HIGHEST_PROTOCOL)

def get_rows(input_file, input_type, delimiter):
  """Returns an iterator over all rows in input_file"""
@@ -94,27 +90,28 @@ def get_rows(input_file, input_type, delimiter):
  # right option here might be to create a class which internally handles data
  # loading.
  if input_type == "xlsx":
    W = px.load_workbook(input_file, use_iterators=True)
    sheet_names = W.get_sheet_names()
    p = W.get_sheet_by_name(name=sheet_names[0])    # Take first sheet as the active sheet
    return p.iter_rows()
    workbook = px.load_workbook(input_file, use_iterators=True)
    sheet_names = workbook.get_sheet_names()
    # Take first sheet as the active sheet
    sheet = workbook.get_sheet_by_name(name=sheet_names[0])
    return sheet.iter_rows()
  elif input_type == "csv":
    with open(input_file, "rb") as f:
      reader = csv.reader(f, delimiter=delimiter)
    with open(input_file, "rb") as inp_file_obj:
      reader = csv.reader(inp_file_obj, delimiter=delimiter)
      return [row for row in reader]
  elif input_type == "pandas":
    with gzip.open(input_file) as f:
      df = pickle.load(f)
    return df.iterrows()
    with gzip.open(input_file) as inp_file_obj:
      dataframe = pickle.load(inp_file_obj)
    return dataframe.iterrows()
  elif input_type == "sdf":
    if ".gz" in input_file:
      with gzip.open(input_file) as f:
        supp = Chem.ForwardSDMolSupplier(f)
      with gzip.open(input_file) as inp_file_obj:
        supp = Chem.ForwardSDMolSupplier(inp_file_obj)
        mols = [mol for mol in supp if mol is not None]
      return mols
    else:
      with open(input_file) as f:
        supp  = Chem.ForwardSDMolSupplier(f)
      with open(input_file) as inp_file_obj:
        supp = Chem.ForwardSDMolSupplier(inp_file_obj)
        mols = [mol for mol in supp if mol is not None]
      return mols

@@ -125,7 +122,7 @@ def get_colnames(row, input_type):
  elif input_type == "csv":
    return row

def get_row_data(row, input_type, fields, smiles_endpoint, colnames=None):
def get_row_data(row, input_type, fields, smiles_field, colnames=None):
  """Extract information from row data."""
  row_data = {}
  if input_type == "xlsx":
@@ -142,11 +139,10 @@ def get_row_data(row, input_type, fields, smiles_endpoint, colnames=None):
    for field in fields:
      row_data[field] = row[field]
  elif input_type == "sdf":
    mol = {}
    mol = row
    for field in fields:
      if field == smiles_endpoint:
        row_data[field] = Chem.MolToSmiles(mol)
      elif not mol.HasProp(field):
      row_data[smiles_field] = Chem.MolToSmiles(mol)
      if not mol.HasProp(field):
        row_data[field] = None
      else:
        row_data[field] = mol.GetProp(field)
@@ -159,7 +155,7 @@ def process_field(data, field_type):
  elif field_type == "float":
    return parse_float_input(data)
  elif field_type == "list-string":
    if type(data) == list:
    if isinstance(data, list):
      return data
    else:
      return data.split(",")
@@ -168,55 +164,57 @@ def process_field(data, field_type):
  elif field_type == "ndarray":
    return data

def generate_targets(dataframe, prediction_field, split_field,
                     smiles_field, id_field, out_pkl):
  """Process input data file and generate labels (i.e. y) as a pkl.gz file.

  Parameters
  ----------
  dataframe: pd.DataFrame
    Input data. Must contain the columns named by the *_field arguments.
  prediction_field: str
    Column holding the target value to predict.
  split_field: str or None
    Column holding a train/test split assignment; omitted when None.
  smiles_field: str
    Column holding the SMILES string for each compound.
  id_field: str
    Column holding the unique molecule identifier.
  out_pkl: str
    Path of the gzipped pickle file to write.
  """
  #TODO(enf, rbharath): Modify package unique identifier to take user-specified
  #unique identifier instead of assuming smiles string
  labels_df = pd.DataFrame([])
  labels_df["mol_id"] = dataframe[[id_field]]
  labels_df["smiles"] = dataframe[[smiles_field]]
  labels_df["prediction"] = dataframe[[prediction_field]]
  if split_field is not None:
    labels_df["split"] = dataframe[[split_field]]

  # Write pkl.gz file
  with gzip.open(out_pkl, "wb") as pickle_file:
    pickle.dump(labels_df, pickle_file, pickle.HIGHEST_PROTOCOL)

def generate_scaffold(smiles_elt, include_chirality=False, smiles_field="smiles"):
  """Compute the Bemis-Murcko scaffold for a SMILES string.

  Parameters
  ----------
  smiles_elt: mapping
    Row-like object indexable by smiles_field that holds a SMILES string
    (e.g. a pandas row passed via DataFrame.apply).
  include_chirality: bool
    Whether chirality is retained in the generated scaffold.
  smiles_field: str
    Key under which the SMILES string is stored in smiles_elt.

  Returns
  -------
  The scaffold produced by vs_utils' ScaffoldGenerator for the molecule.
  """
  smiles_string = smiles_elt[smiles_field]
  mol = Chem.MolFromSmiles(smiles_string)
  engine = ScaffoldGenerator(include_chirality=include_chirality)
  scaffold = engine.get_scaffold(mol)
  return scaffold

def generate_features(dataframe, feature_fields, smiles_field, id_field, out_pkl):
  """Puts user defined features into a standard directory structure.

  Builds a DataFrame with smiles, scaffold, mol_id, and a features column
  (one np.ndarray per compound assembled from feature_fields), then writes
  it to out_pkl as a gzipped pickle. No-op when feature_fields is None.

  Parameters
  ----------
  dataframe: pd.DataFrame
    Input data; must contain smiles_field, id_field, and all feature_fields.
  feature_fields: list of str or None
    Columns to concatenate into the per-compound feature vector.
  smiles_field: str
    Column holding the SMILES string for each compound.
  id_field: str
    Column holding the unique molecule identifier.
  out_pkl: str
    Path of the gzipped pickle file to write.
  """
  if feature_fields is None:
    print("No feature field specified by user.")
    return

  features_df = pd.DataFrame([])
  features_df["smiles"] = dataframe[[smiles_field]]
  features_df["scaffolds"] = dataframe[[smiles_field]].apply(
      functools.partial(generate_scaffold, smiles_field=smiles_field),
      axis=1)
  features_df["mol_id"] = dataframe[[id_field]]

  features_data = []
  for row in dataframe.iterrows():
    # pandas rows are tuples (row_num, row_data)
    row, feature_list = row[1], []
    for feature in feature_fields:
      feature_list.append(row[feature])
    features_data.append({"row": np.array(feature_list)})
  features_df["features"] = pd.DataFrame(features_data)

  with gzip.open(out_pkl, "wb") as pickle_file:
    pickle.dump(features_df, pickle_file, pickle.HIGHEST_PROTOCOL)

def extract_data(input_file, input_type, fields, field_types,
      prediction_endpoint, smiles_endpoint, threshold, delimiter):
                 prediction_field, smiles_field, threshold, delimiter):
  """Extracts data from input as Pandas data frame"""
  rows, mols, smiles = [], [], SmilesGenerator()
  colnames = []
@@ -231,16 +229,16 @@ def extract_data(input_file, input_type, fields, field_types,
    if (input_type == "xlsx" or input_type == "csv") and row_index == 0:
      colnames = get_colnames(raw_row, input_type)
      continue
    row, row_data = {}, get_row_data(raw_row, input_type, fields, smiles_endpoint, colnames)
    for ind, (field, field_type) in enumerate(zip(fields, field_types)):
      if field == prediction_endpoint and threshold is not None:
    row, row_data = {}, get_row_data(raw_row, input_type, fields, smiles_field, colnames)
    for (field, field_type) in zip(fields, field_types):
      if field == prediction_field and threshold is not None:
        raw_val = process_field(row_data[field], field_type)
        row[field] = 1 if raw_val > threshold else 0
      else:
        row[field] = process_field(row_data[field], field_type)
    mol = Chem.MolFromSmiles(row[smiles_endpoint])
    mol = Chem.MolFromSmiles(row_data[smiles_field])
    row["smiles"] = smiles.get_smiles(mol)
    mols.append(mol)
    rows.append(row)
  df = pd.DataFrame(rows)
  return(df, mols)
  dataframe = pd.DataFrame(rows)
  return(dataframe, mols)