Unverified Commit 046ab4a0 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #939 from LRParser/master

Initial impl of mol2vec for deepchem for #894. Tested the features vs a random…
parents facf18b3 626198de
Loading
Loading
Loading
Loading
+30 −0
Original line number Diff line number Diff line
# mol2vec implementation

In the recent mol2vec [paper](https://chemrxiv.org/articles/Mol2vec_Unsupervised_Machine_Learning_Approach_with_Chemical_Intuition/5513581), Jaeger et al. treat the features returned by the RDKit Morgan fingerprint as "words" and a compound as a "sentence" in order to generate fixed-length embeddings. Here we reproduce 200-element embeddings from a download of all SDF files in the PubChem compound database.

## Setup

Ensure that gensim is installed via:

```bash
pip install gensim
```

## Creating training corpus

First, download the PubChem compound SDF corpus by running:

```bash
bash ../pubchem_dataset/download_pubchem_ftp.sh
```
Note: the script assumes that a `/media/data/pubchem` directory exists for this large download (approx. 19 GB as of November 2017).

Then generate the embeddings file via:

```bash
./train_mol2vec.sh
```

You can then use these embeddings as a fixed-length alternative to fingerprints derived directly from RDKit. A full implementation as a featurizer for deepchem is a work in progress.

Example code that uses the `vec.txt` file created by the above script can be found in `eval_mol2vec_results.py`.
 No newline at end of file
+29 −0
Original line number Diff line number Diff line
import gensim
from gensim import models
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import numpy as np

def main() :
    """Print the summed word-vector embedding of glycine.

    Loads the word2vec text-format vectors from ``vec.txt`` in the current
    working directory, collects the Morgan fingerprint identifiers of
    glycine, and adds up the vector of every identifier to obtain a single
    fixed-length embedding, which is printed to stdout.

    If the molecule cannot be parsed, or one of its identifiers is missing
    from the vocabulary, a message is printed and nothing else is output.
    """
    model = models.KeyedVectors.load_word2vec_format("vec.txt")

    # Using canonical smiles for glycine, as in original research paper
    mol = Chem.MolFromSmiles("C(C(=O)O)N")
    if mol is None:
        print("Could not parse the SMILES string")
        return

    # RDKit fills `info`; its keys are the identifiers that serve as the
    # "words" looked up in the model below.
    # NOTE(review): radius 0 is used here while the corpus-generating script
    # calls GetMorganFingerprint with radius 1 -- confirm this is intended.
    info = {}
    rdMolDescriptors.GetMorganFingerprint(mol, 0, bitInfo=info)

    # Size the accumulator from the loaded model rather than hard-coding it,
    # so the script keeps working if vec.txt was trained with another size.
    totalvec = np.zeros(model.vector_size)
    for k in info:
        word = str(k)
        try:
            # Index the KeyedVectors object directly: the `.wv` alias on
            # KeyedVectors is deprecated in gensim 3.x and removed in 4.x.
            wordvec = model[word]
        except KeyError:
            print("Identifier {0} is not in the vec.txt vocabulary".format(word))
            return
        totalvec = np.add(totalvec, wordvec)

    print(totalvec)


if __name__ == "__main__" :
    main()

+40 −0
Original line number Diff line number Diff line
import rdkit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import pandas as pd
import gzip
import os

def main() :
    """Write one line of Morgan identifiers per PubChem compound to stdout.

    Walks the SDF download directory, reads every gzipped SDF file with
    RDKit and, for each molecule that parses, prints the keys of the
    radius-1 Morgan fingerprint ``bitInfo`` dictionary separated by single
    spaces. Diagnostics go to stderr.
    """
    # Local import: only needed for stderr diagnostics. stdout must carry
    # nothing but identifier lines because it is the corpus itself.
    import sys

    sdf_root_path = "/media/data/pubchem/SDF"

    for path, _dirs, filenames in os.walk(sdf_root_path) :
        for filename in filenames:
            # Anything that is not a gzipped SDF cannot be read below
            if not filename.endswith(".sdf.gz"):
                continue

            # This SDF file fails to parse with RDKit on Ubuntu 16.04
            if "Compound_102125001_102150000" in filename:
                continue

            # Join with the directory os.walk is currently visiting, not the
            # root, so that files inside subdirectories resolve correctly.
            filepath = os.path.join(path, filename)

            with gzip.open(filepath, 'rb') as myfile:
                suppl = Chem.ForwardSDMolSupplier(myfile)

                for mol in suppl:

                    # Skip records for which the supplier yields no molecule
                    if not mol:
                        continue

                    info = {}
                    try :
                        rdMolDescriptors.GetMorganFingerprint(mol,1,bitInfo=info)
                    except Exception as e:
                        # Best effort: report the failure and carry on with
                        # the next molecule instead of dropping it silently.
                        print("Skipping molecule in {0}: {1}".format(filename, e),
                              file=sys.stderr)
                        continue

                    # One compound ("sentence") per output line
                    print(" ".join(str(k) for k in info))

if __name__ == "__main__" :
    main()
+4 −0
Original line number Diff line number Diff line
#! /bin/bash
# Build the mol2vec embeddings: dump the identifier corpus to data.txt, then
# train word2vec on it and write the vectors to vec.txt.

# Stop at the first failing command so that word2vec is never trained on a
# partially written corpus.
set -euo pipefail

# Step 1: generate the training corpus
python mol2vec.py > data.txt

# Step 2: train vectors of size 200 on the corpus (-binary 0)
python -m gensim.scripts.word2vec_standalone -train data.txt -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
+87 −0
Original line number Diff line number Diff line
import pandas as pd
import os
from rdkit import Chem
import time
import gzip

def _sdf_to_smiles_csv(sdf_path, csv_path, max_smiles_len):
    """Extract the CID and SMILES of every molecule in one gzipped SDF file.

    Molecules that RDKit cannot parse, and molecules whose SMILES string is
    longer than ``max_smiles_len``, are left out. The result is written to
    ``csv_path`` with the columns ``PUBCHEM_CID`` and ``SMILES``.
    """
    cids = []
    smiles_strings = []

    with gzip.open(sdf_path, 'rb') as sdf_file:
        for mol in Chem.ForwardSDMolSupplier(sdf_file):
            if mol is None:
                continue
            cid = mol.GetProp("PUBCHEM_COMPOUND_CID")
            smiles = mol.GetProp("PUBCHEM_OPENEYE_ISO_SMILES")
            if len(smiles) > max_smiles_len:
                print("Skipped compound: {0} due to large size".format(cid))
                continue
            cids.append(int(cid))
            smiles_strings.append(smiles)

    df = pd.DataFrame({"PUBCHEM_CID" : cids, "SMILES" : smiles_strings},
                      columns=["PUBCHEM_CID", "SMILES"])
    df.to_csv(csv_path, index=False)


def _write_summary_csv(results_path, summary_path):
    """Concatenate all per-file SMILES CSVs in ``results_path`` into one CSV."""
    # Sort so the row order of the summary does not depend on listing order
    csv_names = sorted(name for name in os.listdir(results_path)
                       if name.endswith("_smiles.csv"))
    if not csv_names:
        # pd.concat raises on an empty list, so bail out with a clear message
        print("No SMILES CSV files found, summary CSV not written")
        return

    df_list = []
    for name in csv_names:
        print("Processing: {} for summary CSV".format(name))
        df_list.append(pd.read_csv(os.path.join(results_path, name)))

    print("Writing out summary CSV")
    pd.concat(df_list).to_csv(summary_path, index=False)
    print("Stored data as CSV")


def main() :
    """Convert the PubChem SDF download into SMILES CSV files.

    Each ``*.sdf.gz`` file below the SDF directory is converted into a
    ``*_smiles.csv`` file in the results directory, unless a file with that
    name is already there. Afterwards all per-file CSVs are merged into
    ``/media/data/pubchem/summary.csv``.
    """
    print("Processing PubChem FTP Download")

    sdf_root_path = "/media/data/pubchem/SDF"
    results_path = "/media/data/pubchem/smiles"

    # Only report "already exists" when that is really the case; any other
    # failure to create the directory should surface as an error.
    if os.path.isdir(results_path):
        print("Results directory already exists")
    else:
        os.makedirs(results_path)

    max_smiles_len = 200

    # Names already present in the results directory are not regenerated.
    # The extra entry makes this particular file be skipped unconditionally.
    processed_files = set(os.listdir(results_path))
    processed_files.add("Compound_102125001_102150000_smiles.csv")

    # Zero-based index of the SDF file being handled; counts files only
    file_index = 0

    for path, _dirs, filenames in os.walk(sdf_root_path) :
        for filename in filenames:
            # Anything that is not a gzipped SDF cannot be converted
            if not filename.endswith(".sdf.gz"):
                continue

            print("Processing: {0}".format(filename))

            expected_file_name = filename.replace(".sdf.gz", "_smiles.csv")
            new_file_name = os.path.join(results_path,expected_file_name)

            if expected_file_name in processed_files:
                print("Skipping: {0}".format(new_file_name))
                file_index += 1
                continue

            start = time.time()
            # Join with the directory os.walk is currently visiting, not the
            # root, so that files inside subdirectories resolve correctly.
            filepath = os.path.join(path,filename)
            _sdf_to_smiles_csv(filepath, new_file_name, max_smiles_len)
            end = time.time()

            print("Processed file number: {0} in {1} seconds".format(file_index, end - start))
            file_index += 1

    # Now parse all results smile files into one big file
    _write_summary_csv(results_path, "/media/data/pubchem/summary.csv")

    print("Done")

if __name__ == '__main__':
    main()
Loading