Unverified commit 138522e7, authored by Bharath Ramsundar and committed by GitHub
Browse files

Merge pull request #2033 from peastman/load

Optimizations to data loading
parents 0c5b2b89 624dfb1a
Loading
Loading
Loading
Loading
+14 −28
Original line number Diff line number Diff line
@@ -44,27 +44,13 @@ def _convert_df_to_numpy(df, tasks):
  n_samples = df.shape[0]
  n_tasks = len(tasks)

  time1 = time.time()
  y = np.hstack(
      [np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
  time2 = time.time()

  w = np.ones((n_samples, n_tasks))
  missing = np.zeros_like(y).astype(int)
  feature_shape = None

  for ind in range(n_samples):
    for task in range(n_tasks):
      if y[ind, task] == "":
        missing[ind, task] = 1

  # ids = df[id_field].values
  # Set missing data to have weight zero
  for ind in range(n_samples):
    for task in range(n_tasks):
      if missing[ind, task]:
        y[ind, task] = 0.
        w[ind, task] = 0.
  if y.dtype.kind in ['O', 'U']:
    missing = (y == '')
    y[missing] = 0
    w[missing] = 0

  return y.astype(float), w.astype(float)

+7 −18
Original line number Diff line number Diff line
@@ -168,16 +168,8 @@ class CoulombMatrix(MolecularFeaturizer):
    rval = []
    for conf in mol.GetConformers():
      d = self.get_interatomic_distances(conf)
      m = np.zeros((n_atoms, n_atoms))
      for i in range(mol.GetNumAtoms()):
        for j in range(mol.GetNumAtoms()):
          if i == j:
            m[i, j] = 0.5 * z[i]**2.4
          elif i < j:
            m[i, j] = (z[i] * z[j]) / d[i, j]
            m[j, i] = m[i, j]
          else:
            continue
      m = np.outer(z, z) / d
      m[range(n_atoms), range(n_atoms)] = 0.5 * np.array(z)**2.4
      if self.randomize:
        for random_m in self.randomize_coulomb_matrix(m):
          random_m = pad_array(random_m, self.max_atoms)
@@ -236,12 +228,9 @@ class CoulombMatrix(MolecularFeaturizer):
    ]  # Convert AtomPositions from Angstrom to bohr (atomic units)
    d = np.zeros((n_atoms, n_atoms), dtype=float)
    for i in range(n_atoms):
      for j in range(n_atoms):
        if i < j:
      for j in range(i):
        d[i, j] = coords[i].Distance(coords[j])
        d[j, i] = d[i, j]
        else:
          continue
    return d