Unverified Commit 6e2b09e7 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1832 from deepchem/hash_utils

Adding hash utils file
parents 36251aef 8f8538f4
Loading
Loading
Loading
Loading
+88 −0
Original line number Diff line number Diff line
"""
Various utilities around hash functions.
"""
import logging
import numpy as np
import hashlib

logger = logging.getLogger(__name__)


def hash_ecfp(ecfp, size):
  """
  Returns an int < size representing given ECFP fragment.

  Input must be a string. This utility function is used for various
  ECFP based fingerprints.

  Parameters
  ----------
  ecfp: str
    String to hash. Usually an ECFP fragment.
  size: int, optional (default 1024)
    Hash to an int in range [0, size) 
  """
  ecfp = ecfp.encode('utf-8')
  md5 = hashlib.md5()
  md5.update(ecfp)
  digest = md5.hexdigest()
  ecfp_hash = int(digest, 16) % (size)
  return (ecfp_hash)


def hash_ecfp_pair(ecfp_pair, size):
  """Returns an int < size representing that ECFP pair.

  Input must be a tuple of strings. This utility is primarily used for
  spatial contact featurizers. For example, if a protein and ligand
  have close contact region, the first string could be the protein's
  fragment and the second the ligand's fragment. The pair could be
  hashed together to achieve one hash value for this contact region.

  Parameters
  ----------
  ecfp_pair: tuple
    Pair of ECFP fragment strings
  size: int, optional (default 1024)
    Hash to an int in range [0, size) 
  """
  ecfp = "%s,%s" % (ecfp_pair[0], ecfp_pair[1])
  ecfp = ecfp.encode('utf-8')
  md5 = hashlib.md5()
  md5.update(ecfp)
  digest = md5.hexdigest()
  ecfp_hash = int(digest, 16) % (size)
  return (ecfp_hash)


def vectorize(hash_function, feature_dict=None, size=1024):
  """Helper function to vectorize a spatial description from a hash.

  Hash functions are used to perform spatial featurizations in
  DeepChem. However, it's necessary to convert backwards from
  the hash function to feature vectors. This function aids in
  this conversion procedure. It creates a vector of zeros of length
  `seize`. It then loops through `feature_dict`, uses `hash_function`
  to hash the stored value to an integer in range [0, size) and bumps
  that index.

  Parameters
  ----------
  hash_function: function
    Should accept two arguments, `feature`, and `size` and
    return a hashed integer. Here `feature` is the item to
    hash, and `size` is an int. For example, if `size=1024`,
    then hashed values must fall in range `[0, 1024)`.
  feature_dict: dict
    Maps unique keys to features computed. 
  size: int, optional (default 1024)
    Length of generated bit vector
  """
  feature_vector = np.zeros(size)
  if feature_dict is not None:
    on_channels = [
        hash_function(feature, size) for key, feature in feature_dict.items()
    ]
    feature_vector[on_channels] += 1

  return feature_vector
+40 −0
Original line number Diff line number Diff line
import unittest
import numpy as np
from deepchem.utils import hash_utils


def random_string(length, chars=None):
  import string
  if chars is None:
    chars = list(string.ascii_letters + string.ascii_letters + '()[]+-.=#@/\\')
  return ''.join(np.random.choice(chars, length))


class TestHashUtils(unittest.TestCase):

  def test_hash_ecfp(self):
    for power in (2, 16, 64):
      for _ in range(10):
        string = random_string(10)
        string_hash = hash_utils.hash_ecfp(string, power)
        self.assertIsInstance(string_hash, int)
        self.assertLess(string_hash, 2**power)
        self.assertGreaterEqual(string_hash, 0)

  def test_hash_ecfp_pair(self):
    for power in (2, 16, 64):
      for _ in range(10):
        string1 = random_string(10)
        string2 = random_string(10)
        pair_hash = hash_utils.hash_ecfp_pair((string1, string2), power)
        self.assertIsInstance(pair_hash, int)
        self.assertLess(pair_hash, 2**power)
        self.assertGreaterEqual(pair_hash, 0)

  def test_vectorize(self):
    size = 16
    feature_dict = {0: "C", 1: "CC", 2: "CCC"}
    hash_function = hash_utils.hash_ecfp
    vector = hash_utils.vectorize(hash_function, feature_dict, size)
    assert vector.shape == (size,)
    assert np.count_nonzero(vector) == len(feature_dict)