Merge pull request #556 from lilleswing/perm-dataset (a51911aa) · Commits · 钟慕尧 / deepchem

datasets/membrane_permeability.sdf

0 → 100644

+25840 −0

File added.

Preview size limit exceeded, changes collapsed.

0 → 100644

+202 −0

+8 −1

Original line number	Diff line number	Diff line
		@@ -240,9 +240,16 @@ class SDFLoader(DataLoader):
		Handles loading of SDF files.
		"""

		+def __init__(self, tasks, clean_mols=False, **kwargs):
		+super(SDFLoader, self).__init__(tasks, **kwargs)
		+self.clean_mols = clean_mols
		+self.smiles_field = "smiles"
		+self.mol_field = "mol"
		+self.id_field = "smiles"
		+
		def get_shards(self, input_files, shard_size):
		"""Defines a generator which returns data for each shard"""
		-return load_sdf_files(input_files)
		+return load_sdf_files(input_files, self.clean_mols)

		def featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""

+2 −2

Original line number	Diff line number	Diff line
		@@ -67,7 +67,7 @@ def load_data(input_files, shard_size=None, verbose=True):
		yield load_pickle_from_disk(input_file)


		-def load_sdf_files(input_files):
		+def load_sdf_files(input_files, clean_mols):
		"""Load SDF file into dataframe."""
		dataframes = []
		for input_file in input_files:
		@@ -75,7 +75,7 @@ def load_sdf_files(input_files):
		raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
		# Structures are stored in .sdf file
		print("Reading structures from %s." % input_file)
		-suppl = Chem.SDMolSupplier(str(input_file), False, False, False)
		+suppl = Chem.SDMolSupplier(str(input_file), clean_mols, False, False)
		df_rows = []
		for ind, mol in enumerate(suppl):
		if mol is not None:

+8 −5

Original line number	Diff line number	Diff line
		@@ -57,6 +57,7 @@ from qm9.qm9_datasets import load_qm9
		from sampl.sampl_datasets import load_sampl
		from clintox.clintox_datasets import load_clintox
		from hiv.hiv_datasets import load_hiv
		+from membrane_permeability.membrane_permeability_datasets import load_permeability
		import xgboost


		@@ -92,7 +93,7 @@ def benchmark_loading_datasets(hyper_parameters,
		mode = 'classification'
		elif dataset in [
		'kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'qm7', 'qm7b', 'qm9',
		-'sampl'
		+'sampl', 'membrane_permeability'
		]:
		mode = 'regression'
		else:
		@@ -160,7 +161,8 @@ def benchmark_loading_datasets(hyper_parameters,
		'qm9': load_qm9,
		'sampl': load_sampl,
		'clintox': load_clintox,
		-'hiv': load_hiv
		+'hiv': load_hiv,
		+'membrane_permeability': load_permeability
		}

		print('-------------------------------------')
		@@ -916,8 +918,8 @@ if __name__ == '__main__':
		dest='dataset_args',
		default=[],
		help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' +
		-'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, qm7b, qm9, clintox, hiv'
		-)
		+'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, qm7b, qm9, clintox, hiv,'
		+' membrane_permeability')
		parser.add_argument(
		'-t',
		action='store_true',
		@@ -943,7 +945,8 @@ if __name__ == '__main__':
		if len(datasets) == 0:
		datasets = [
		'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv', 'sampl',
		-'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b', 'qm9'
		+'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b', 'qm9',
		+'membrane_permeability'
		]

		#input hyperparameters