Unverified Commit 41d9c7f1 authored by Nathan Frey's avatar Nathan Frey Committed by GitHub
Browse files

Merge pull request #2406 from ncfrey/atomicconvfix

[WIP] AtomicConvFeaturizer test fixes
parents 6cf77a92 38e7a470
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ from deepchem.feat.molecule_featurizers import SmilesToSeq, create_char_to_idx
from deepchem.feat.complex_featurizers import RdkitGridFeaturizer
from deepchem.feat.complex_featurizers import NeighborListAtomicCoordinates
from deepchem.feat.complex_featurizers import NeighborListComplexAtomicCoordinates
from deepchem.feat.complex_featurizers import AtomicConvFeaturizer
from deepchem.feat.complex_featurizers import (
    ComplexNeighborListFragmentAtomicCoordinates,)
from deepchem.feat.complex_featurizers import ContactCircularFingerprint
+1 −0
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ Featurizers for complex.
from deepchem.feat.complex_featurizers.rdkit_grid_featurizer import RdkitGridFeaturizer
from deepchem.feat.complex_featurizers.complex_atomic_coordinates import NeighborListAtomicCoordinates
from deepchem.feat.complex_featurizers.complex_atomic_coordinates import NeighborListComplexAtomicCoordinates
from deepchem.feat.complex_featurizers.complex_atomic_coordinates import AtomicConvFeaturizer
from deepchem.feat.complex_featurizers.complex_atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates
from deepchem.feat.complex_featurizers.contact_fingerprints import ContactCircularFingerprint
from deepchem.feat.complex_featurizers.contact_fingerprints import ContactCircularVoxelizer
+58 −13
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
Atomic coordinate featurizer.
"""
import logging
import warnings

import numpy as np

@@ -134,23 +135,26 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
    return (system_coords, system_neighbor_list)


class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
class AtomicConvFeaturizer(ComplexFeaturizer):
  """This class computes the featurization that corresponds to AtomicConvModel.

  This class computes featurizations needed for AtomicConvModel. Given a
  two molecular structures, it computes a number of useful geometric
  features. In particular, for each molecule and the global complex, it
  computes a coordinates matrix of size (N_atoms, 3) where N_atoms is the
  number of atoms. It also computes a neighbor-list, a dictionary with
  N_atoms elements where neighbor-list[i] is a list of the atoms the i-th
  atom has as neighbors. In addition, it computes a z-matrix for the
  molecule which is an array of shape (N_atoms,) that contains the atomic
  This class computes featurizations needed for AtomicConvModel.
  Given two molecular structures, it computes a number of useful
  geometric features. In particular, for each molecule and the global
  complex, it computes a coordinates matrix of size (N_atoms, 3)
  where N_atoms is the number of atoms. It also computes a
  neighbor-list, a dictionary with N_atoms elements where
  neighbor-list[i] is a list of the atoms the i-th atom has as
  neighbors. In addition, it computes a z-matrix for the molecule
  which is an array of shape (N_atoms,) that contains the atomic
  number of that atom.

  Since the featurization computes these three quantities for each of the
  two molecules and the complex, a total of 9 quantities are returned for
  each complex. Note that for efficiency, fragments of the molecules can be
  provided rather than the full molecules themselves.
  Since the featurization computes these three quantities for each of
  the two molecules and the complex, a total of 9 quantities are
  returned for each complex. Note that for efficiency, fragments of
  the molecules can be provided rather than the full molecules
  themselves.

  """

  def __init__(self,
@@ -160,6 +164,27 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
               max_num_neighbors,
               neighbor_cutoff,
               strip_hydrogens=True):
    """

    Parameters
    ----------
    frag1_num_atoms: int
      Maximum number of atoms in fragment 1.
    frag2_num_atoms: int
      Maximum number of atoms in fragment 2.
    complex_num_atoms: int
      Maximum number of atoms in complex of frag1/frag2 together.
    max_num_neighbors: int
      Maximum number of atoms considered as neighbors.
    neighbor_cutoff: float
      Maximum distance (angstroms) for two atoms to be considered as
      neighbors. If more than `max_num_neighbors` atoms fall within
      this cutoff, the closest `max_num_neighbors` will be used.
    strip_hydrogens: bool (default True)
      Remove hydrogens before computing featurization.

    """

    self.frag1_num_atoms = frag1_num_atoms
    self.frag2_num_atoms = frag2_num_atoms
    self.complex_num_atoms = complex_num_atoms
@@ -176,6 +201,7 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
          mol_pdb_file, is_protein=False, sanitize=True, add_hydrogens=False)
      frag2_coords, frag2_mol = load_molecule(
          protein_pdb_file, is_protein=True, sanitize=True, add_hydrogens=False)

    except MoleculeLoadException:
      # Currently handles loading failures by returning None
      # TODO: Is there a better handling procedure?
@@ -216,6 +242,7 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
    logging.info("Featurizing molecule of size: %d", len(mol.GetAtoms()))
    neighbor_list = compute_neighbor_list(coords, self.neighbor_cutoff,
                                          self.max_num_neighbors, None)
    # pad outputs
    z = self.get_Z_matrix(mol, max_num_atoms)
    z = pad_array(z, max_num_atoms)
    coords = pad_array(coords, (max_num_atoms, 3))
@@ -253,3 +280,21 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
    mol = MoleculeShim(atomic_numbers)
    coords = coords[indexes_to_keep]
    return coords, mol


# Message template used when the old atomic conv featurizer name is
# instantiated. The first `{}` is filled with the deprecated name and the
# second `{}` with its replacement.
ATOMICCONV_DEPRECATION = (
    "{} is deprecated and has been renamed to {} "
    "and will be removed in DeepChem 3.0.")


class ComplexNeighborListFragmentAtomicCoordinates(AtomicConvFeaturizer):
  """Deprecated name for `AtomicConvFeaturizer`.

  This subclass adds no behavior of its own. Constructing it emits a
  `FutureWarning` built from `ATOMICCONV_DEPRECATION` and then defers to
  `AtomicConvFeaturizer.__init__` with the arguments it was given.
  """

  def __init__(self, *args, **kwargs):
    """Warn about the rename, then initialize as `AtomicConvFeaturizer`.

    Parameters
    ----------
    *args
      Positional arguments forwarded unchanged to
      `AtomicConvFeaturizer.__init__`.
    **kwargs
      Keyword arguments forwarded unchanged to
      `AtomicConvFeaturizer.__init__`.

    """
    # stacklevel=2 attributes the warning to the code that instantiated
    # the deprecated class rather than to this shim.
    warnings.warn(
        ATOMICCONV_DEPRECATION.format(
            "ComplexNeighborListFragmentAtomicCoordinates",
            "AtomicConvFeaturizer"),
        FutureWarning,
        stacklevel=2)

    super(ComplexNeighborListFragmentAtomicCoordinates, self).__init__(
        *args, **kwargs)
+2876 −0

File added.

Preview size limit exceeded, changes collapsed.

+58 −0
Original line number Diff line number Diff line
"""
Test atomic conv featurizer.
"""

import os
import logging

import numpy as np

from deepchem.feat import AtomicConvFeaturizer

logger = logging.getLogger(__name__)


def test_atomic_conv_featurization():
  """Featurize the 3zso ligand/protein pair and check all nine outputs.

  For the ligand fragment, the protein fragment and the full complex,
  this checks the shape of the coordinate matrix, the keys and first
  entry of the neighbor list, and the shape of the atomic number array.
  The ligand's atomic numbers are additionally compared element-wise.
  """
  here = os.path.dirname(os.path.realpath(__file__))
  ligand_pdb = os.path.join(here, "data/3zso_ligand_hyd.pdb")
  protein_pdb = os.path.join(here, "data/3zso_protein_noH.pdb")

  # Atom counts are taken from the PDB files themselves. A dataset with
  # several PDBs would pass the maximum atom counts instead.
  n_ligand = 44
  n_protein = 2334
  n_complex = 2378
  neighbors_per_atom = 4
  cutoff_angstroms = 4

  featurizer = AtomicConvFeaturizer(n_ligand, n_protein, n_complex,
                                    neighbors_per_atom, cutoff_angstroms)
  features = featurizer._featurize((ligand_pdb, protein_pdb))
  (ligand_xyz, ligand_nbrs, ligand_z, protein_xyz, protein_nbrs, protein_z,
   system_xyz, system_nbrs, system_z) = features

  def check_outputs(xyz, nbrs, z, num_atoms, first_entry):
    # The atom counts above are exact for these files, so each output is
    # expected to have exactly one entry per atom.
    assert xyz.shape == (num_atoms, 3)
    assert sorted(nbrs.keys()) == list(range(num_atoms))
    assert nbrs[0] == first_entry
    assert z.shape == (num_atoms,)

  check_outputs(ligand_xyz, ligand_nbrs, ligand_z, n_ligand, [1, 2, 14, 3])
  expected_ligand_z = np.array([
      6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
      6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8
  ])
  assert np.array_equal(ligand_z, expected_ligand_z)

  check_outputs(protein_xyz, protein_nbrs, protein_z, n_protein, [1, 2, 4, 3])

  check_outputs(system_xyz, system_nbrs, system_z, n_complex, [1, 2, 14, 3])
Loading