Commit c7ef90ab authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

First batch changes

parent 565fb2f9
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
# Data Loading Examples

The examples in this directory highlight a number of ways to
load datasets into DeepChem for downstream analysis.
load datasets into DeepChem for downstream analysis: 

- `pandas_csv.py` shows how to directly load a dataset from a CSV file without using a `DataLoader`. 
- `sdf_load.py` shows how to load a dataset from an SDF file using `SDFLoader`. 
+5 −2
Original line number Diff line number Diff line
@@ -4,14 +4,17 @@
# directly.
import pandas as pd
import deepchem as dc
from rdkit import Chem

# Read the example table with pandas and echo it so the raw input is visible.
df = pd.read_csv("example.csv")
print("Original data loaded as DataFrame:")
print(df)

# Circular-fingerprint featurizer configured with size=16.
featurizer = dc.feat.CircularFingerprint(size=16)
# NOTE(review): the next two statements featurize the raw "smiles" column
# directly, but `features` and `dataset` are both reassigned a few lines
# below. They look like the pre-change side of a diff rendered next to the
# post-change side -- confirm which version is intended before running.
features = featurizer.featurize(df["smiles"])
dataset = dc.data.NumpyDataset(X=features, y=df["log-solubility"], ids=df["Compound ID"])
# Parse every SMILES string with RDKit first, then featurize the parsed
# molecules instead of the raw strings.
mols = [Chem.MolFromSmiles(smiles) for smiles in df["smiles"]]
features = featurizer.featurize(mols)
# Bundle the features with the "log-solubility" column as y and the
# "Compound ID" column as ids.
dataset = dc.data.NumpyDataset(
    X=features, y=df["log-solubility"], ids=df["Compound ID"])

print("Data converted into DeepChem Dataset")
print(dataset)
+6 −0
Original line number Diff line number Diff line
# This example shows how to load data from an SDF file into DeepChem. The data in this SDF file is stored in the field "LogP(RRCK)".
import deepchem as dc

# Circular-fingerprint featurizer (size=16) handed to the loader below.
featurizer = dc.feat.CircularFingerprint(size=16)
# The loader is constructed with the "LogP(RRCK)" field name (the field the
# header comment says holds the data) and with sanitize=True.
loader = dc.data.SDFLoader(["LogP(RRCK)"], featurizer=featurizer, sanitize=True)
# Run the loader over the SDF file; the result is bound to `dataset`.
dataset = loader.featurize("membrane_permeability.sdf")
+10 −4
Original line number Diff line number Diff line
import numpy as np
import deepchem as dc

# NOTE(review): `mols` is assigned twice with the same SMILES strings, once on
# a single line and once wrapped. This looks like the old and new sides of a
# formatting-only diff shown together -- confirm and keep only one.
mols = ['C1=CC2=C(C=C1)C1=CC=CC=C21', 'O=C1C=CC(=O)C2=C1OC=CO2', 'C1=C[N]C=C1', 'C1=CC=CC=C[C+]1', 'C1=[C]NC=C1', 'N[C@@H](C)C(=O)O', 'N[C@H](C)C(=O)O', 'CC', 'O=C=O', 'C#N', 'CCN(CC)CC', 'CC(=O)O', 'C1CCCCC1', 'c1ccccc1']
mols = [
    'C1=CC2=C(C=C1)C1=CC=CC=C21', 'O=C1C=CC(=O)C2=C1OC=CO2', 'C1=C[N]C=C1',
    'C1=CC=CC=C[C+]1', 'C1=[C]NC=C1', 'N[C@@H](C)C(=O)O', 'N[C@H](C)C(=O)O',
    'CC', 'O=C=O', 'C#N', 'CCN(CC)CC', 'CC(=O)O', 'C1CCCCC1', 'c1ccccc1'
]
print("Original set of molecules")
print(mols)

# NOTE(review): two splitter setups follow. The first passes seed=123 and
# splits the raw list of SMILES strings; the second builds the splitter with
# no arguments and splits a NumpyDataset. The second overwrites `splitter`,
# `train`, `valid` and `test`, so the first pair looks like the pre-change
# side of the diff -- confirm which one is intended.
splitter = dc.splits.ScaffoldSplitter(seed=123)
train, valid, test = splitter.train_valid_test_split(mols)
splitter = dc.splits.ScaffoldSplitter()
# TODO: This should be swapped for simpler splitter API once that's merged in.
# Wrap the SMILES strings in a NumpyDataset, using them both as X and as ids,
# so that the splitter is handed a dataset object.
dataset = dc.data.NumpyDataset(X=np.array(mols), ids=mols)
train, valid, test = splitter.train_valid_test_split(dataset)
# The return values are dc.data.Dataset objects so we need to extract
# the ids
print("Training set")
@@ -14,4 +21,3 @@ print("Valid set")
# Print the validation split, then the test split under its own label.
# NOTE(review): the statements between the "Training set" label and this
# point are elided by the diff hunk -- confirm against the full script.
print(valid)
print("Test set")
print(test)
Loading