Commit a51911aa authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #556 from lilleswing/perm-dataset

Cell Membrane Permeability Dataset
parents f2adf41a a82efab0
Loading
Loading
Loading
Loading
+25840 −0

File added.

Preview size limit exceeded, changes collapsed.

+202 −0
Original line number Diff line number Diff line
LogP(RRCK)
-5.08
-4.82
-4.89
-4.8
-4.77
-4.85
-4.96
-5.66
-4.85
-5.8
-6
-4.48
-4.55
-5.4
-4.77
-5.4
-4.89
-4.66
-5.4
-4.62
-5.7
-4.96
-5.7
-4.66
-5.3
-4.8
-5.22
-4.68
-5.52
-6
-4.72
-4.85
-4.8
-4.57
-4.64
-5.7
-4.41
-4.92
-4.96
-5.22
-6.22
-4.66
-5.01
-5.1
-6.15
-6.3
-6.22
-5.8
-5.92
-4.82
-5.15
-5.57
-5.82
-5.85
-5.4
-5.38
-6.1
-5.01
-4.96
-5.01
-5.82
-6.05
-5.6
-5.21
-5.26
-4.71
-4.52
-4.71
-4.73
-4.99
-4.56
-4.62
-4.67
-5
-4.62
-4.6
-4.66
-4.59
-4.52
-4.84
-4.8
-4.53
-4.5
-4.79
-4.85
-6.22
-5.4
-5.34
-6.1
-6.1
-5.57
-5.64
-5.51
-5.47
-5.36
-4.74
-4.03
-4.62
-4.77
-5.32
-5.15
-5.55
-5.4
-5.41
-6.09
-5.21
-6.1
-4.56
-4.51
-4.41
-5.92
-5.54
-5.29
-5.89
-4.55
-4.99
-5.85
-5.12
-5.12
-5.15
-4.44
-4.75
-4.65
-4.59
-5.62
-4.65
-4.7
-6.1
-4.54
-6.15
-5.3
-4.8
-5.25
-4.52
-6.05
-4.74
-5.47
-4.71
-4.82
-5.38
-4.89
-4.63
-4.35
-6.3
-6.05
-4.97
-4.61
-6.05
-4.63
-4.59
-4.94
-4.57
-5.82
-4.44
-4.05
-5.34
-4.62
-4.7
-4.67
-4.55
-4.49
-5.26
-4.83
-4.96
-5.41
-6.15
-4.52
-6.05
-6.1
-4.16
-4.9
-6.22
-4.53
-4.91
-5.42
-5.39
-4.51
-5.82
-6.15
-5.21
-6.4
-5.74
-6.52
-6.7
-6.15
-5.85
-5.92
-6.22
-5.72
-6.12
-5.46
-5.31
-6.3
-5.39
-5.21
-5.8
-5.33
-5.49
-5.26
-5.89
-5.24
+8 −1
Original line number Diff line number Diff line
@@ -240,9 +240,16 @@ class SDFLoader(DataLoader):
  Handles loading of SDF files.
  """

  def __init__(self, tasks, clean_mols=False, **kwargs):
    """Initialize the loader and record its SDF-specific settings.

    `tasks` and any extra keyword arguments are handed to the parent
    constructor unchanged. `clean_mols` is kept on the instance as
    `self.clean_mols`.
    """
    super(SDFLoader, self).__init__(tasks, **kwargs)
    # Field names are fixed here; the id field reuses the "smiles" name.
    self.id_field = self.smiles_field = "smiles"
    self.mol_field = "mol"
    self.clean_mols = clean_mols

  def get_shards(self, input_files, shard_size):
    """Return the shard data loaded from the given SDF files.

    Delegates to `load_sdf_files`, forwarding this loader's `clean_mols`
    setting so it reaches the SDF reader.

    `shard_size` is accepted for interface compatibility but is not used
    by this method.
    """
    # Only the call that forwards `clean_mols` is kept: an earlier
    # `return load_sdf_files(input_files)` ahead of it made this call
    # unreachable and dropped the setting.
    return load_sdf_files(input_files, self.clean_mols)

  def featurize_shard(self, shard):
    """Featurizes a shard of an input dataframe."""
+2 −2
Original line number Diff line number Diff line
@@ -67,7 +67,7 @@ def load_data(input_files, shard_size=None, verbose=True):
      yield load_pickle_from_disk(input_file)


def load_sdf_files(input_files):
def load_sdf_files(input_files, clean_mols):
  """Load SDF file into dataframe."""
  dataframes = []
  for input_file in input_files:
@@ -75,7 +75,7 @@ def load_sdf_files(input_files):
    raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
    # Structures are stored in .sdf file
    print("Reading structures from %s." % input_file)
    suppl = Chem.SDMolSupplier(str(input_file), False, False, False)
    suppl = Chem.SDMolSupplier(str(input_file), clean_mols, False, False)
    df_rows = []
    for ind, mol in enumerate(suppl):
      if mol is not None:
+8 −5
Original line number Diff line number Diff line
@@ -57,6 +57,7 @@ from qm9.qm9_datasets import load_qm9
from sampl.sampl_datasets import load_sampl
from clintox.clintox_datasets import load_clintox
from hiv.hiv_datasets import load_hiv
from membrane_permeability.membrane_permeability_datasets import load_permeability
import xgboost


@@ -92,7 +93,7 @@ def benchmark_loading_datasets(hyper_parameters,
    mode = 'classification'
  elif dataset in [
      'kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'qm7', 'qm7b', 'qm9',
      'sampl'
      'sampl', 'membrane_permeability'
  ]:
    mode = 'regression'
  else:
@@ -160,7 +161,8 @@ def benchmark_loading_datasets(hyper_parameters,
      'qm9': load_qm9,
      'sampl': load_sampl,
      'clintox': load_clintox,
      'hiv': load_hiv
      'hiv': load_hiv,
      'membrane_permeability': load_permeability
  }

  print('-------------------------------------')
@@ -916,8 +918,8 @@ if __name__ == '__main__':
      dest='dataset_args',
      default=[],
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' +
      'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, qm7b, qm9, clintox, hiv'
  )
      'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, qm7b, qm9, clintox, hiv,'
      ' membrane_permeability')
  parser.add_argument(
      '-t',
      action='store_true',
@@ -943,7 +945,8 @@ if __name__ == '__main__':
  if len(datasets) == 0:
    datasets = [
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv', 'sampl',
        'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b', 'qm9'
        'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b', 'qm9',
        'membrane_permeability'
    ]

  #input hyperparameters
Loading