Grab-bag of changes (1474bc05) · Commits · 钟慕尧 / deepchem

deepchem/datasets/init.py

+7 −5

Original line number	Diff line number	Diff line
		@@ -116,6 +116,12 @@ class Dataset(object):
		out_ids = "%s-ids.joblib" % basename

		if X is not None:
		############################################## DEBUG
		print("X.shape")
		print(X.shape)
		print("os.path.join(data_dir, out_X)")
		print(os.path.join(data_dir, out_X))
		############################################## DEBUG
		save_to_disk(X, os.path.join(data_dir, out_X))
		save_to_disk(X, os.path.join(data_dir, out_X_transformed))
		X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
		@@ -221,11 +227,9 @@ class Dataset(object):
		def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None):
		n_samples = len(X)
		# The -1 indicates that y will be reshaped to have length -1
		######################################################### DEBUG
		if n_samples > 0:
		y = np.reshape(y, (n_samples, -1))
		######################################################### DEBUG
		#y = np.reshape(y, (n_samples, -1))
		w = np.reshape(w, (n_samples, -1))
		n_tasks = y.shape[1]
		if ids is None:
		ids = np.arange(n_samples)
		@@ -238,11 +242,9 @@ class Dataset(object):

		def select(self, select_dir, indices):
		"""Creates a new dataset from a selection of indices from self."""
		################################################### DEBUG
		indices = np.array(indices).astype(int)
		X, y, w, ids = self.to_numpy()
		tasks = self.get_task_names()
		################################################### DEBUG
		X_sel, y_sel, w_sel, ids_sel = (
		X[indices], y[indices], w[indices], ids[indices])
		return Dataset.from_numpy(select_dir, X_sel, y_sel, w_sel, ids_sel, tasks)

deepchem/featurizers/featurize.py

+58 −56

Original line number	Diff line number	Diff line
		@@ -17,7 +17,6 @@ from rdkit import Chem
		from deepchem.utils.save import log
		from deepchem.utils.save import save_to_disk
		from deepchem.utils.save import load_from_disk
		from deepchem.utils.save import load_pandas_from_disk
		from deepchem.featurizers import Featurizer, ComplexFeaturizer
		from deepchem.featurizers import UserDefinedFeaturizer
		from deepchem.datasets import Dataset
		@@ -38,14 +37,18 @@ def _process_field(val):
		else:
		raise ValueError("Field of unrecognized type: %s" % str(val))

		def load_data(input_file):
		"""Loads data from disk."""
		def load_data(input_file, shard_size=None):
		"""Loads data from disk.

		For CSV files, supports sharded loading for large files.
		"""
		input_type = _get_input_type(input_file)
		if input_type == "sdf":
		raw_df = _load_sdf_file(input_file)
		if shard_size is not None:
		raise ValueError("shard_size must be None for sdf input.")
		return _load_sdf_file(input_file)
		else:
		raw_df = _load_csv_file(input_file)
		return raw_df
		return _load_csv_file(input_file, shard_size)

		def _load_sdf_file(input_file):
		"""Load SDF file into dataframe."""
		@@ -61,12 +64,12 @@ def _load_sdf_file(input_file):
		df_rows.append([ind,smiles,mol])
		mol_df = pd.DataFrame(df_rows, columns=('mol_id', 'smiles', 'mol'))
		raw_df = pd.concat([mol_df, raw_df], axis=1, join='inner')
		return raw_df
		return [raw_df]

		def _load_csv_file(input_file):
		"""Loads CSV file into dataframe."""
		raw_df = load_pandas_from_disk(input_file)
		return raw_df
		def _load_csv_file(filename, shard_size=None):
		"""Load data as pandas dataframe."""
		# First line of user-specified CSV must be header.
		return pd.read_csv(filename, header=0, chunksize=shard_size)

		def _get_input_type(input_file):
		"""Get type of input file. Must be csv/pkl.gz/sdf file."""
		@@ -115,7 +118,7 @@ class DataFeaturizer(object):
		dataframe object to disk as output.
		"""

		def __init__(self, tasks, smiles_field,
		def __init__(self, tasks, smiles_field=None,
		id_field=None, threshold=None,
		protein_pdb_field=None, ligand_pdb_field=None,
		ligand_mol2_field=None, mol_field=None,
		@@ -148,61 +151,59 @@ class DataFeaturizer(object):
		"""Featurize provided file and write to specified location."""
		log("Loading raw samples now.", self.verbosity)

		raw_df = load_data(input_file)
		fields = raw_df.keys()
		log("Loaded raw data frame from file.", self.verbosity)
		log("About to preprocess samples.", self.verbosity)

		if not os.path.exists(data_dir):
		os.makedirs(data_dir)

		def process_raw_sample_helper(row, fields, input_type):
		return self._process_raw_sample(input_type, row, fields)
		# Construct partial function to write datasets.
		write_fn = partial(
		Dataset.write_dataframe, data_dir=data_dir,
		featurizers=self.featurizers, tasks=self.tasks)
		input_type = _get_input_type(input_file)
		process_raw_sample_helper_partial = partial(process_raw_sample_helper,
		fields=fields,
		input_type=input_type)

		metadata_rows = []
		for shard_num, raw_df_shard in enumerate(load_data(input_file, shard_size)):
		log("Loaded shard %d of size %d from file." % (shard_num+1, shard_size),
		self.verbosity)
		log("About to featurize shard.", self.verbosity)

		nb_sample = raw_df.shape[0]
		interval_points = np.linspace(
		0, nb_sample, np.ceil(float(nb_sample)/shard_size)+1, dtype=int)
		def process_helper(row, fields, input_type):
		return self._process_raw_sample(input_type, row, fields)
		process_fn = partial(process_helper, fields=raw_df_shard.keys(),
		input_type=input_type)

		metadata_rows = []
		# Construct partial function to write datasets.
		write_dataframe_partial = partial(
		Dataset.write_dataframe, data_dir=data_dir,
		featurizers=self.featurizers, tasks=self.tasks)
		metadata_rows.append(self._featurize_shard(
		raw_df_shard, process_fn, write_fn, shard_num, input_type))

		for j in range(len(interval_points)-1):
		log("Sharding and standardizing into shard-%s / %s shards"
		% (str(j+1), len(interval_points)-1), self.verbosity)
		raw_df_shard = raw_df.iloc[range(interval_points[j], interval_points[j+1])]
		raw_df_shard = raw_df_shard.apply(
		process_raw_sample_helper_partial, axis=1, reduce=False)
		# TODO(rbharath): This whole bit with metadata_rows is an awkward way of
		# creating a Dataset. Is there a more elegant solutions?
		dataset = Dataset(data_dir=data_dir,
		metadata_rows=metadata_rows,
		reload=reload, verbosity=self.verbosity)
		return dataset

		df = self._standardize_df(raw_df_shard)
		def _featurize_shard(self, raw_df_shard, process_fn, write_fn, shard_num, input_type):
		"""Featurizes a shard of an input dataframe."""
		log("Applying processing transformation to shard.",
		self.verbosity)
		raw_df_shard = raw_df_shard.apply(
		process_fn, axis=1, reduce=False)
		log("About to standardize dataframe.")
		df_shard = self._standardize_df(raw_df_shard)

		field = "mol" if input_type == "sdf" else "smiles"
		for featurizer in self.featurizers:
		log("Currently featurizing feature_type: %s"
		% featurizer.__class__.__name__, self.verbosity)
		if isinstance(featurizer, UserDefinedFeaturizer):
		self._add_user_specified_features(df, featurizer)
		self._add_user_specified_features(df_shard, featurizer)
		elif isinstance(featurizer, Featurizer):
		self._featurize_mol(df, featurizer, field=field,
		self._featurize_mol(df_shard, featurizer, field=field,
		worker_pool=worker_pool)
		elif isinstance(featurizer, ComplexFeaturizer):
		self._featurize_complexes(df, featurizer,
		self._featurize_complexes(df_shard, featurizer,
		worker_pool=worker_pool)
		basename = "shard-%d" % j
		metadata_rows.append(write_dataframe_partial((basename, df)))

		dataset = Dataset(data_dir=data_dir,
		metadata_rows=metadata_rows,
		reload=reload, verbosity=self.verbosity)

		return dataset
		basename = "shard-%d" % shard_num
		return write_fn((basename, df_shard))

		def _shard_files_exist(self, feature_dir):
		"""Checks if data shard files already exist."""
		@@ -239,6 +240,7 @@ class DataFeaturizer(object):
		"""
		df = pd.DataFrame(ori_df[[self.id_field]])
		df.columns = ["mol_id"]
		if self.smiles_field is not None:
		df["smiles"] = ori_df[[self.smiles_field]]
		for task in self.tasks:
		df[task] = ori_df[[task]]

deepchem/featurizers/grid_featurizer.py

+92 −79

Original line number	Diff line number	Diff line
		@@ -22,26 +22,24 @@ import shutil
		import multiprocessing as mp


		'''
		"""
		http://stackoverflow.com/questions/38987/how-can-i-merge-two-python-dictionaries-in-a-single-expression
		'''


		"""
		def _merge_two_dicts(x, y):
		'''Given two dicts, merge them into a new dict as a shallow copy.'''
		"""Given two dicts, merge them into a new dict as a shallow copy."""
		z = x.copy()
		z.update(y)
		return z

		def _compute_centroid(coordinates):
		'''given molecule, an instance of class PDB, compute the x,y,z centroid of that molecule'''
		"""given molecule, an instance of class PDB, compute the x,y,z centroid of that molecule"""

		centroid = np.mean(coordinates, axis=0)
		return(centroid)


		def _generate_random__unit_vector():
		'''generate a random unit vector on the 3-sphere
		"""generate a random unit vector on the 3-sphere
		citation:
		http://mathworld.wolfram.com/SpherePointPicking.html

		@@ -49,7 +47,7 @@ def _generate_random__unit_vector():
		b. Choose random z \element [-1, 1]
		c. Compute output: (x,y,z) = (sqrt(1-z^2)cos(theta), sqrt(1-z^2)sin(theta),z)
		d. output u
		'''
		"""

		theta = np.random.uniform(low=0.0, high=2 * np.pi)
		z = np.random.uniform(low=-1.0, high=1.0)
		@@ -59,7 +57,7 @@ def _generate_random__unit_vector():


		def _generate_random_rotation_matrix():
		'''
		"""
		1. generate a random unit vector, i.e., randomly sampled from the unit 3-sphere
		a. see function _generate_random__unit_vector() for details
		2. Generate a second random unit vector thru the algorithm in (1), output v
		@@ -67,13 +65,13 @@ def _generate_random_rotation_matrix():
		(intuition: we want them to be as linearly independent as possible or else the
		orthogonalized version of v will be much shorter in magnitude compared to u. I assume
		in Stack they took this from Gram-Schmidt orthogonalization?)
		b. v' = v - (u \dot v)*u, i.e. subtract out the component of v that's in u's direction
		c. normalize v' (this isn't in Stack but I assume it must be done)
		3. find w = u \cross v'
		4. u, v', and w will form the columns of a rotation matrix, R. The intuition is that
		u, v' and w are, respectively, what the standard basis vectors e1, e2, and e3 will be
		b. v" = v - (u \dot v)*u, i.e. subtract out the component of v that"s in u"s direction
		c. normalize v" (this isn"t in Stack but I assume it must be done)
		3. find w = u \cross v"
		4. u, v", and w will form the columns of a rotation matrix, R. The intuition is that
		u, v" and w are, respectively, what the standard basis vectors e1, e2, and e3 will be
		mapped to under the transformation.
		'''
		"""

		u = _generate_random__unit_vector()
		v = _generate_random__unit_vector()
		@@ -90,7 +88,7 @@ def _generate_random_rotation_matrix():


		def _rotate_molecules(mol_coordinates_list):
		'''
		"""
		Pseudocode:
		1. Generate random rotation matrix. This matrix applies a random transformation to any
		3-vector such that, were the random transformation repeatedly applied, it would randomly
		@@ -98,7 +96,7 @@ def _rotate_molecules(mol_coordinates_list):
		cf. _generate_random_rotation_matrix() for details
		2. Apply R to all atomic coordinatse.
		3. Return rotated molecule
		'''
		"""
		R = _generate_random_rotation_matrix()
		rotated_coordinates_list = []

		@@ -112,12 +110,12 @@ def _rotate_molecules(mol_coordinates_list):


		def _reflect_molecule(mol_coordinates_list):
		'''
		"""
		Pseudocode:
		1. Generate unit vector that is randomly distributed around 3-sphere
		2. For each atom, project its coordinates onto the random unit vector from (1),
		and subtract twice the projection from the original coordinates to obtain its reflection
		'''
		"""

		molecule = deepcopy(mol)
		a = _generate_random__unit_vector()
		@@ -132,12 +130,12 @@ def _reflect_molecule(mol_coordinates_list):


		def _compute_pairwise_distances(protein_xyz, ligand_xyz):
		'''
		"""
		Takes an input m x 3 and n x 3 np arrays of 3d coords of protein and ligand,
		respectively, and outputs an m x n np array of pairwise distances in Angstroms
		between protein and ligand atoms. entry (i,j) is dist between the i'th protein
		atom and the j'th ligand atom
		'''
		between protein and ligand atoms. entry (i,j) is dist between the i"th protein
		atom and the j"th ligand atom
		"""

		pairwise_distances = np.zeros(
		(np.shape(protein_xyz)[0], np.shape(ligand_xyz)[0]))
		@@ -148,9 +146,9 @@ def _compute_pairwise_distances(protein_xyz, ligand_xyz):

		return(pairwise_distances)

		'''following two functions adapted from:
		"""following two functions adapted from:
		http://stackoverflow.com/questions/2827393/angles-between-two-n-dimensional-vectors-in-python
		'''
		"""


		def _unit_vector(vector):
		@@ -159,7 +157,7 @@ def _unit_vector(vector):


		def _angle_between(vector_i, vector_j):
		""" Returns the angle in radians between vectors 'vector_i' and 'vector_j'::
		""" Returns the angle in radians between vectors "vector_i" and "vector_j"::

		>>> _angle_between((1, 0, 0), (0, 1, 0))
		1.5707963267948966
		@@ -180,10 +178,10 @@ def _angle_between(vector_i, vector_j):


		def _bfs(mol, startatom, D):
		'''
		"""
		given openbabel molecule and a starting atom of type OBAtom,
		finds all bonds out to degree D via a breath-first search
		'''
		"""

		visited_atoms = set()
		atoms_to_add = []
		@@ -206,11 +204,11 @@ def _bfs(mol, startatom, D):


		def _construct_fragment_from_bonds(bonds):
		'''
		"""
		takes as input a list of bonds of type OBBond and constructs a new
		openbabel molecule from those bonds and the atoms that constitute
		the start and end of those bonds.
		'''
		"""

		fragment = ob.OBMol()
		added_atoms = []
		@@ -243,14 +241,14 @@ def _construct_fragment_from_bonds(bonds):


		def _compute_ecfp(system_ob, start_atom, max_degree=2):
		'''
		"""
		Given an openbabel molecule and a starting atom (OBAtom object),
		compute the ECFP[max_degree]-like representation for that atom.
		Returns (for now) a SMILES string representing the resulting fragment.

		TODO(enf): Optimize this! Try InChi key and other approaches
		to improving this representation.
		'''
		"""

		fragment = ob.OBMol()

		@@ -265,10 +263,10 @@ def _hash_sybyl(sybyl, sybyl_types):
		return(sybyl_types.index(sybyl))

		def _hash_ecfp(ecfp, power):
		'''
		"""
		Returns an int of size 2^power representing that
		ECFP fragment. Input must be a string.
		'''
		"""

		md5 = hashlib.md5()
		md5.update(ecfp)
		@@ -278,10 +276,10 @@ def _hash_ecfp(ecfp, power):


		def _hash_ecfp_pair(ecfp_pair, power):
		'''
		"""
		Returns an int of size 2^power representing that
		ECFP pair. Input must be a tuple of strings.
		'''
		"""

		ecfp = "%s,%s" % (ecfp_pair[0], ecfp_pair[1])
		md5 = hashlib.md5()
		@@ -292,12 +290,12 @@ def _hash_ecfp_pair(ecfp_pair, power):


		def _compute_all_ecfp(system_ob, indices=None, degree=2):
		'''
		"""
		For each atom:
		Obtain molecular fragment for all atoms emanating outward to given degree.
		For each fragment, compute SMILES string (for now) and hash to an int.
		Return a dictionary mapping atom index to hashed SMILES.
		'''
		"""

		ecfp_dict = {}

		@@ -311,12 +309,12 @@ def _compute_all_ecfp(system_ob, indices=None, degree=2):


		def _compute_ecfp_features(system_ob, ecfp_degree, ecfp_power):
		'''
		"""
		Takes as input an openbabel molecule, ECFP radius, and number of bits to store
		ECFP features (2^ecfp_power will be length of ECFP array);
		Returns an array of size 2^ecfp_power where array at index i has a 1 if that ECFP fragment
		is found in the molecule and array at index j has a 0 if ECFP fragment not in molecule.
		'''
		"""

		ecfp_dict = _compute_all_ecfp(system_ob, degree=ecfp_degree)
		ecfp_vec = [_hash_ecfp(ecfp, ecfp_power)
		@@ -328,9 +326,9 @@ def _compute_ecfp_features(system_ob, ecfp_degree, ecfp_power):

		def _featurize_binding_pocket_ecfp(protein_xyz, protein, ligand_xyz, ligand,
		pairwise_distances=None, cutoff=4.5, ecfp_degree=2):
		'''
		"""
		Computes ECFP dicts for both the ligand and the binding pocket region of the protein.
		'''
		"""

		features_dict = {}

		@@ -373,11 +371,11 @@ def _featurize_binding_pocket_sybyl(protein_xyz, protein, ligand_xyz, ligand,

		def _compute_splif_features_in_range(protein, ligand, pairwise_distances, contact_bin,
		ecfp_degree=2):
		'''
		"""
		Find all protein atoms that are > contact_bin[0] and < contact_bin[1] away from ligand atoms.
		Then, finds the ECFP fingerprints for the contacting atoms.
		Returns a dictionary mapping (protein_index_i, ligand_index_j) --> (protein_ecfp_i, ligand_ecfp_j)
		'''
		"""
		contacts = np.nonzero(
		(pairwise_distances > contact_bin[0]) & (
		pairwise_distances < contact_bin[1]))
		@@ -397,11 +395,11 @@ def _compute_splif_features_in_range(protein, ligand, pairwise_distances, contac

		def _featurize_splif(protein_xyz, protein, ligand_xyz, ligand, contact_bins, pairwise_distances,
		ecfp_degree):
		'''
		"""
		For each contact range (i.e. 1 A to 2 A, 2 A to 3 A, etc.) compute a dictionary mapping
		(protein_index_i, ligand_index_j) tuples --> (protein_ecfp_i, ligand_ecfp_j) tuples.
		return a list of such splif dictionaries.
		'''
		"""

		if pairwise_distances is None:
		pairwise_distances = _compute_pairwise_distances(
		@@ -478,7 +476,7 @@ def _update_feature_dict(feature_dict, idxs=None, indices=None):
		def _compute_pi_stack(protein_xyz, protein, ligand_xyz, ligand,
		pairwise_distances=None, dist_cutoff=4.4,
		angle_cutoff=30.):
		'''
		"""
		Pseudocode:

		for each ring in ligand:
		@@ -493,7 +491,7 @@ def _compute_pi_stack(protein_xyz, protein, ligand_xyz, ligand,
		if it counts as pi-T:
		for each atom in ligand and in protein:
		add to list of atom indices
		'''
		"""
		protein_pi_parallel = {}
		protein_pi_t = {}
		ligand_pi_parallel = {}
		@@ -553,7 +551,7 @@ def _compute_cation_pi(protein, ligand, protein_cation_pi, ligand_cation_pi):
		protein_ring_center = _compute_ring_center(protein, protein_ring)
		protein_ring_normal = _compute_ring_normal(protein, protein_ring)
		for atom in ob.OBMolAtomIter(ligand):
		if np.abs(atom.GetFormalCharge() - 1.0) < 0.01 or '+' in atom.GetType():
		if np.abs(atom.GetFormalCharge() - 1.0) < 0.01 or "+" in atom.GetType():
		cation_position = np.array([atom.x(), atom.y(), atom.z()])
		if _is_cation_pi(cation_position, protein_ring_center, protein_ring_normal):
		protein_cation_pi = _update_feature_dict(protein_cation_pi,
		@@ -573,9 +571,9 @@ def _compute_binding_pocket_cation_pi(protein_xyz, protein, ligand_xyz, ligand):
		return (protein_cation_pi, ligand_cation_pi)

		def _get_formal_charge(atom):
		if '+' in atom.GetType() or np.abs(1.0-atom.GetFormalCharge()) < 0.01:
		if "+" in atom.GetType() or np.abs(1.0-atom.GetFormalCharge()) < 0.01:
		return 1
		elif '-' in atom.GetType() or np.abs(-1.0-atom.GetFormalCharge()) < 0.01:
		elif "-" in atom.GetType() or np.abs(-1.0-atom.GetFormalCharge()) < 0.01:
		return -1
		else:
		return atom.GetFormalCharge()
		@@ -605,10 +603,10 @@ def _is_angle_within_cutoff(vector_i, vector_j, hbond_angle_cutoff):

		def _is_hydrogen_bond(protein_xyz, protein, ligand_xyz,
		ligand, contact, hbond_angle_cutoff):
		'''
		"""
		Determine if a pair of atoms (contact = tuple of protein_atom_index, ligand_atom_index)
		between protein and ligand represents a hydrogen bond. Returns a boolean result.
		'''
		"""

		protein_atom_index = contact[0]
		ligand_atom_index = contact[1]
		@@ -638,10 +636,10 @@ def _is_hydrogen_bond(protein_xyz, protein, ligand_xyz,
		def _compute_hbonds_in_range(protein, protein_xyz, ligand, ligand_xyz,
		pairwise_distances, hbond_dist_bin,
		hbond_angle_cutoff, ecfp_degree):
		'''
		"""
		Find all pairs of (protein_index_i, ligand_index_j) that hydrogen bond given
		a distance bin and an angle cutoff.
		'''
		"""

		contacts = np.nonzero(
		(pairwise_distances > hbond_dist_bin[0]) & (
		@@ -662,10 +660,10 @@ def _compute_hbonds_in_range(protein, protein_xyz, ligand, ligand_xyz,
		def _compute_hydrogen_bonds(protein_xyz, protein, ligand_xyz, ligand,
		pairwise_distances, hbond_dist_bins,
		hbond_angle_cutoffs, ecfp_degree):
		'''
		"""
		Returns a list of sublists. Each sublist is a series of tuples of (protein_index_i, ligand_index_j)
		that represent a hydrogen bond. Each sublist represents a different type of hydrogen bond.
		'''
		"""

		hbond_contacts = []
		for i, hbond_dist_bin in enumerate(hbond_dist_bins):
		@@ -678,9 +676,9 @@ def _compute_hydrogen_bonds(protein_xyz, protein, ligand_xyz, ligand,


		def _convert_atom_to_voxel(molecule_xyz, atom_index, box_width, voxel_width):
		'''
		"""
		Converts an atom to an i,j,k grid index.
		'''
		"""
		coordinates = molecule_xyz[atom_index, :]
		indices = np.floor(np.abs(molecule_xyz[atom_index, :] +
		np.array([box_width, box_width, box_width]) / 2.0)
		@@ -690,9 +688,9 @@ def _convert_atom_to_voxel(molecule_xyz, atom_index, box_width, voxel_width):

		def _convert_atom_pair_to_voxel(
		molecule_xyz_tuple, atom_index_pair, box_width, voxel_width):
		'''
		"""
		Converts a pair of atoms to a list of i,j,k tuples.
		'''
		"""
		indices_list = []
		indices_list.append(_convert_atom_to_voxel(molecule_xyz_tuple[0],
		atom_index_pair[0], box_width, voxel_width)[0])
		@@ -701,10 +699,10 @@ def _convert_atom_pair_to_voxel(
		return(indices_list)

		def _merge_molecules(protein_xyz, protein, ligand_xyz, ligand):
		'''
		"""
		Takes as input protein and ligand objects of class PDB and adds ligand atoms to the protein,
		and returns the new instance of class PDB called system that contains both sets of atoms.
		'''
		"""

		system_xyz = np.array(np.vstack(np.vstack((protein_xyz, ligand_xyz))))
		system_ob = ob.OBMol(protein_ob)
		@@ -719,9 +717,9 @@ def _compute_charge_dictionary(molecule):
		return charge_dictionary

		def _subtract_centroid(xyz, centroid):
		'''
		"""
		subtracts the centroid, a numpy array of dim 3, from all coordinates of all atoms in the molecule
		'''
		"""

		xyz -= np.transpose(centroid)
		return(xyz)
		@@ -805,21 +803,28 @@ class GridFeaturizer(ComplexFeaturizer):
		return features

		def _transform(self, protein_pdb, ligand_pdb):
		'''Takes as input files (strings) for pdb of the protein, pdb of the ligand, and a directory
		"""Takes as input files (strings) for pdb of the protein, pdb of the ligand, and a directory
		to save intermediate files.

		This function then computes the centroid of the ligand; decrements this centroid from the atomic coordinates of protein and
		ligand atoms, and then merges the translated protein and ligand. This combined system/complex is then saved.
		This function then computes the centroid of the ligand; decrements this
		centroid from the atomic coordinates of protein and ligand atoms, and then
		merges the translated protein and ligand. This combined system/complex is then
		saved.

		This function then computes a featurization with scheme specified by the user.
		'''
		"""
		a = time.time()
		protein_name = str(protein_pdb).split(
		"/")[len(str(protein_pdb).split("/")) - 2]

		#if not self.ligand_only:
		# protein_xyz, protein_ob = self._load_molecule(protein_pdb, calc_charges=True)
		#ligand_xyz, ligand_ob = self._load_molecule(ligand_pdb, calc_charges=True)
		#################################################### DEBUG
		if not self.ligand_only:
		protein_xyz, protein_ob = self._load_molecule(protein_pdb, calc_charges=True)
		ligand_xyz, ligand_ob = self._load_molecule(ligand_pdb, calc_charges=True)
		protein_xyz, protein_ob = self._load_molecule(protein_pdb, calc_charges=False)
		ligand_xyz, ligand_ob = self._load_molecule(ligand_pdb, calc_charges=False)
		#################################################### DEBUG



		@@ -1035,10 +1040,10 @@ class GridFeaturizer(ComplexFeaturizer):
		return feature_vector

		def _get_xyz_from_ob(self, ob_mol):
		'''
		"""
		returns an m x 3 np array of 3d coords
		of given openbabel molecule
		'''
		"""

		xyz = np.zeros((ob_mol.NumAtoms(), 3))
		for i, atom in enumerate(ob.OBMolAtomIter(ob_mol)):
		@@ -1048,17 +1053,16 @@ class GridFeaturizer(ComplexFeaturizer):
		return(xyz)

		def _load_molecule(self, molecule_file, remove_hydrogens=True, calc_charges=False):
		'''
		"""
		given molecule_file, returns a tuple of xyz coords of molecule
		and an openbabel object representing that molecule
		'''
		"""

		if ".mol2" in molecule_file:
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("mol2"), str("mol2"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, molecule_file)

		else:
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("pdb"), str("pdb"))
		@@ -1067,6 +1071,14 @@ class GridFeaturizer(ComplexFeaturizer):

		if calc_charges:
		gasteiger = ob.OBChargeModel.FindType(str("gasteiger"))
		###################################### DEBUG
		print("molecule_file")
		print(molecule_file)
		print("ob_mol is None")
		print(ob_mol is None)
		print("gasteiger is None")
		print(gasteiger is None)
		###################################### DEBUG
		gasteiger.ComputeCharges(ob_mol)
		ob_mol.UnsetImplicitValencePerceived()
		ob_mol.CorrectForPH(7.4)
		@@ -1079,9 +1091,10 @@ class GridFeaturizer(ComplexFeaturizer):
		return xyz, ob_mol

		def _generate_box(self, mol):
		'''
		generate_box takes as input a molecule of class PDB and removes all atoms outside of the given box dims
		'''
		"""
		Generate_box takes as input a molecule of class PDB and removes all atoms
		outside of the given box dims
		"""

		molecule = deepcopy(mol)
		atoms_to_keep = []

deepchem/models/sklearn_models/init.py

+5 −0

Original line number	Diff line number	Diff line
		@@ -42,6 +42,11 @@ class SklearnModel(Model):
		Fits SKLearn model to data.
		"""
		X, y, w, _ = dataset.to_numpy()
		########################################### DEBUG
		print("SklearnModel.fit()")
		print("X.shape, y.shape, w.shape")
		print(X.shape, y.shape, w.shape)
		########################################### DEBUG
		y, w = np.squeeze(y), np.squeeze(w)
		# Logistic regression doesn't support weights
		if not isinstance(self.raw_model, LogisticRegression):

deepchem/scripts/featurize_pdbbind.py

+9 −7

File changed.

Preview size limit exceeded, changes collapsed.

Admin message