Merge pull request #996 from LRParser/pubchem-changes (16d8ced0) · Commits · 钟慕尧 / deepchem

contrib/pubchem_dataset/README.md

+23 −15

Original line number	Diff line number	Diff line
		This provides a utility for generating bioassay datasets in PubChem, similar to the PCBA-128 dataset used in the original "Massively Multitask Learning" paper by Ramsundar et al. 2015. The usage is as follows:
		This provides a utility for generating bioassay datasets in PubChem, similar to the pcba dataset used in the original "Massively Multitask Learning" paper by Ramsundar et al. 2015. The usage is as follows:

		Before starting, it is recommended to first set the DEEPCHEM_DATA_DIR environment variable to a directory where you have at least 66GB+30GB of storage available (for all PubChem SDFs + all Bioassay CSVs).

		@@ -11,21 +11,29 @@ python create_smiles_mapping.py

		Note: On an 8-core desktop computer as of Nov 2017 it took approximately 17 hours to execute create_smiles_mapping.py (that is, to extract the smiles from all the downloaded, gzipped SDF files from PubChem)

		Then, parameterize the create_assay_overview.py script by setting the following variables. Only one boolean should be set to true.
		If parse_128_only is set it will create a summary dataset based on the original 128 bioassays.
		If parse_all_ncgc is set it will create a summary dataset based on all NCGC assays available in PubChem as of Nov 2017
		The gene_symbol only needs to be set if parse_selected_gene is true. If so, it will build a results table focused on assays relevant to this gene.
		Then, parameterize the create_assay_overview.py script by setting the following options:

		```python
		parse_128_only = False
		parse_all_ncgc = False
		parse_selected_gene = True
		gene_symbol = "PPARG"
		```

		Then run:
		```bash
		python create_assay_overview.py
		usage: create_assay_overview.py [-h] [-d DATASET_NAME] [-g GENE_ARG]

		Deepchem dataset builder for PCBA datasets

		optional arguments:
		-h, --help show this help message and exit
		-d DATASET_NAME Choice of dataset: pcba_128, pcba_146
		-g GENE_ARG Name of gene to create a dataset for
		```

		At the end you will have a pcba.csv.gz file in your DEEPCHEM_DATA_DIR ready for benchmarking
		No newline at end of file
		You must select either -d pcba_146, -d pcba_2475 or -g GENE_SYMBOL.

		At the end you will have a file (e.g. pcba_146.csv.gz) in your DEEPCHEM_DATA_DIR, ready for benchmarking.

		Also, please note that pcba_146 corresponds to the following query on PubChem Bioassay Search:

		10000[TotalSidCount] : 1000000000[TotalSidCount] AND 30[ActiveSidCount] : 1000000000[ActiveSidCount] AND 0[TargetCount] : 1[TargetCount] AND "NCGC"[Source Name] AND "small molecule"[filt] AND "doseresponse"[filt]

		This yields (as of Dec 2017) an additional 18 bioassays beyond the core 128 bioassays in PCBA-128

		pcba_2475 corresponds to:

		1[TotalSidCount] : 1000000000[TotalSidCount] AND 5[ActiveSidCount] : 10000000000[ActiveSidCount] AND 0[TargetCount] : 1[TargetCount] AND "small molecule"[filt] AND "doseresponse"[filt]
		No newline at end of file

contrib/pubchem_dataset/create_assay_overview.py

+199 −124

File changed.

Preview size limit exceeded, changes collapsed.

contrib/pubchem_dataset/create_smiles_mapping.py

+38 −38

Original line number	Diff line number	Diff line
		@@ -6,6 +6,7 @@ import gzip
		import pickle
		import deepchem


		def main():
		print("Processing PubChem FTP Download")

		@@ -13,7 +14,6 @@ def main() :
		sdf_dir = os.path.join(data_dir, "SDF")

		compound_read_count = 0
		max_smiles_len = 50
		keys = list()
		values = list()
		overall_start = time.time()
		@@ -44,9 +44,6 @@ def main() :
		cid = mol.GetProp("PUBCHEM_COMPOUND_CID")
		try:
		smiles = Chem.MolToSmiles(mol)
		if len(smiles) > max_smiles_len:
		compound_read_count =compound_read_count + 1
		continue
		keys.append(int(cid))
		values.append(smiles)
		except Exception:
		@@ -54,20 +51,23 @@ def main() :
		continue
		end = time.time()

		print("Processed file, processed thru compound number: {0} in {1} seconds".format(compound_read_count, end - start))
		print("Processed file, processed thru compound number: {0} in {1} seconds".
		format(compound_read_count, end - start))
		compound_read_count = compound_read_count + 1

		overall_end = time.time()
		secs_elapsed = overall_end - overall_start
		print("Parsed all smiles in: {0} seconds, or {1} minutes, or {2} hours".format(secs_elapsed,secs_elapsed/60,secs_elapsed/3600))

		with open("/media/data/pubchem/pubchemsmiles_tuple.pickle","wb") as f:
		print("Parsed all smiles in: {0} seconds, or {1} minutes, or {2} hours".
		format(secs_elapsed, secs_elapsed / 60, secs_elapsed / 3600))
		print("Total length of: {}".format(len(keys)))
		with open(os.path.join(data_dir, "/pubchemsmiles_tuple.pickle"), "wb") as f:
		pickle.dump((keys, values), f)
		print("Done")
		overall_end = time.time()
		secs_elapsed = overall_end - overall_start
		print("Sorted and saved smiles in: {0} seconds, or {1} minutes, or {2} hours".format(secs_elapsed, secs_elapsed / 60,
		secs_elapsed / 3600))
		print("Sorted and saved smiles in: {0} seconds, or {1} minutes, or {2} hours".
		format(secs_elapsed, secs_elapsed / 60, secs_elapsed / 3600))


		if __name__ == '__main__':
		main()

contrib/pubchem_dataset/ncgc_bioassays.txt

deleted100644 → 0

+0 −1071

Original line number	Diff line number	Diff line
		588834
		1259256
		1259255
		1259253
		1259252
		1224862
		1224861
		1224860
		1224859
		1224858
		1224857
		1224856
		1224855
		1224854
		1224853
		1159614
		1159530
		1159513
		1159512
		1159511
		1159510
		1382
		1159524
		1117306
		1117305
		1117304
		1117303
		1117302
		1117301
		1117300
		1117299
		1117298
		743383
		743382
		743381
		743380
		743379
		743373
		1117318
		1117313
		1117312
		1117308
		743172
		743167
		720664
		743195
		743187
		743171
		720555
		720550
		720549
		1053195
		1053194
		743244
		687012
		687011
		687010
		623995
		686922
		686915
		652283
		743327
		743326
		743325
		652174
		652170
		652100
		652099
		652098
		652097
		652096
		652095
		652094
		743292
		743288
		743285
		2675
		743286
		743279
		743266
		652173
		743262
		652264
		652254
		652251
		652172
		743255
		743245
		624288
		743207
		743206
		743205
		652183
		652180
		652175
		686935
		686934
		686933
		720711
		720709
		720708
		720707
		488940
		720659
		720653
		720652
		651621
		720636
		687009
		651605
		720584
		720579
		720580
		720576
		720575
		720573
		720572
		624496
		624495
		624494
		720569
		720559
		720537
		624500
		720565
		720553
		720551
		720542
		720538
		720534
		720541
		720540
		720536
		720535
		720533
		720532
		720528
		720527
		720526
		720525
		720524
		720523
		720522
		720519
		720518
		720517
		720498
		2517
		720507
		720506
		720504
		720503
		720502
		720501
		720499
		720497
		720496
		686947
		686948
		686980
		686979
		686978
		624462
		624391
		624389
		686971
		686970
		624144
		624143
		624142
		624131
		624130
		624059
		624054
		624053
		624052
		624049
		624048
		624047
		624046
		624045
		624034
		624023
		624019
		623997
		652256
		624094
		624093
		624113
		624112
		602480
		602479
		602446
		602445
		602444
		602443
		602456
		602454
		602452
		602450
		652106
		652105
		652104
		652048
		602392
		602391
		602373
		602371
		602370
		602366
		602362
		602360
		624098
		652054
		652051
		652038
		488949
		652037
		652025
		652016
		652015
		652023
		602291
		602290
		602288
		602286
		602262
		602258
		602256
		602238
		602237
		624162
		624415
		540355
		2707
		651965
		651713
		651712
		624479
		624476
		504602
		602122
		588853
		651843
		651595
		651593
		504375
		651838
		651819
		651820
		651815
		651814
		651813
		651812
		602310
		651804
		651802
		651793
		651791
		651789
		651784
		651768
		651778
		651777
		651758
		651757
		651755
		651754
		651751
		651749
		651743
		651741
		651725
		651724
		588740
		588738
		651635
		651644
		651603
		651600
		651599
		651597
		651568
		651567
		651558
		588333
		651550
		588331
		588330
		588329
		588324
		588323
		624501
		624498
		624493
		540363
		540353
		540352
		540350
		624465
		624464
		624463
		624461
		540360
		624455
		624418
		624417
		624414
		624346
		624342
		624341
		624337
		624336
		624335
		624332
		624331
		624329
		624328
		624312
		624308
		624305
		624298
		624297
		624296
		624291
		624287
		624263
		624261
		624253
		624252
		624251
		624250
		624249
		624248
		624247
		624246
		624173
		504937
		624202
		504780
		624172
		624171
		624170
		624161
		624160
		504891
		624148
		624149
		624147
		624146
		504760
		504759
		504758
		504752
		504741
		504740
		504739
		504737
		624122
		624117
		624116
		624044
		624032
		624031
		624030
		2566
		624002
		602179
		504662
		504656
		623960
		623959
		602478
		602477
		602476
		602475
		602474
		602448
		602447
		602442
		602441
		602388
		602384
		602377
		602332
		602365
		602364
		602358
		602357
		602313
		492949
		602293
		602292
		602289
		504434
		504433
		504432
		504608
		504604
		504599
		2576
		2533
		2095
		493215
		602233
		602224
		588859
		602204
		602202
		602201
		602200
		602199
		588856
		588855
		492954
		504836
		588813
		588812
		540276
		588809
		588795
		489039
		489038
		489037
		489036
		588790
		588722
		492960
		488913
		488908
		488900
		588591
		588590
		588579
		588547
		588546
		588545
		588544
		588543
		588541
		588537
		588536
		588535
		588534
		588533
		588532
		588527
		588526
		588516
		588515
		588514
		588513
		588463
		588456
		588453
		588382
		588379
		504386
		504373
		504371
		588397
		588378
		504613
		504537
		588349
		588347
		588346
		588344
		588342
		1997
		504389
		504387
		504385
		504384
		504380
		504379
		504377
		504376
		504372
		504366
		504362
		540327
		540362
		540361
		540357
		504749
		540325
		540317
		540303
		504834
		504832
		540256
		540280
		540279
		540277
		540275
		540267
		540263
		504894
		540253
		504585
		504895
		504865
		504812
		504850
		504810
		504848
		504847
		504845
		504844
		504842
		504841
		492989
		504809
		504806
		504779
		504778
		504777
		504748
		504731
		504624
		504618
		504603
		504595
		504706
		2781
		504686
		504681
		485295
		504647
		504660
		1484
		504652
		504651
		504648
		488854
		488851
		488850
		488849
		488845
		488838
		488832
		488821
		504637
		504636
		504548
		504629
		2407
		504611
		504610
		504606
		504581
		504547
		504536
		504534
		504374
		504370
		1477
		504474
		504467
		504466
		504465
		1623
		504444
		504320
		504318
		504316
		504315
		504314
		504430
		504429
		504428
		504427
		504424
		2114
		504421
		2663
		2537
		881
		2512
		1482
		2573
		2565
		2564
		504364
		504339
		504333
		504332
		504327
		493220
		493218
		493216
		493214
		493212
		493210
		493209
		493208
		2163
		2155
		493206
		493205
		493204
		493203
		493201
		493200
		493199
		493192
		493189
		493188
		493185
		493179
		493174
		493170
		493169
		493168
		493166
		493165
		493153
		493127
		493123
		488837
		493107
		493106
		493084
		493078
		493074
		493071
		493067
		493066
		493065
		493056
		493014
		493005
		485345
		2104
		492961
		1569
		492948
		492947
		489043
		488773
		489012
		489008
		489007
		485353
		485367
		2593
		488983
		488982
		488981
		488978
		2101
		1474
		488953
		488887
		488883
		488872
		485368
		2494
		2710
		2705
		1970
		2266
		485358
		485347
		485344
		2561
		2113
		488816
		434936
		434931
		1464
		488752
		488745
		2530
		488772
		485364
		485360
		485349
		485341
		485313
		2597
		2596
		2595
		2592
		2590
		2589
		2588
		2587
		485314
		485298
		485297
		485294
		485290
		2662
		485281
		2568
		2567
		2515
		2514
		2513
		2281
		463254
		2639
		2634
		2490
		2547
		2499
		463106
		463097
		463096
		1405
		998
		997
		959
		958
		957
		945
		944
		942
		929
		923
		912
		909
		908
		907
		906
		900
		897
		896
		892
		890
		889
		888
		886
		875
		1519
		1379
		995
		994
		993
		989
		988
		987
		986
		985
		984
		983
		982
		981
		980
		979
		978
		977
		976
		975
		974
		973
		972
		971
		970
		969
		968
		967
		966
		965
		964
		963
		962
		961
		960
		955
		948
		947
		946
		943
		939
		938
		934
		933
		931
		930
		928
		926
		925
		924
		922
		921
		918
		917
		916
		915
		914
		910
		904
		903
		902
		899
		895
		893
		891
		887
		885
		884
		883
		1475
		1742
		2783
		2782
		2778
		2326
		2562
		2552
		2487
		2479
		2455
		2449
		2447
		2444
		2389
		2162
		2795
		2794
		2787
		2786
		2785
		2584
		2472
		2451
		1485
		2763
		2762
		2741
		2737
		2734
		2733
		2730
		2729
		2115
		2111
		2110
		2109
		2100
		2713
		2712
		2711
		2708
		2701
		2688
		2687
		2685
		2680
		2677
		2676
		2673
		2672
		2671
		2669
		2668
		2667
		2666
		2660
		2641
		1491
		1489
		2613
		2585
		2572
		1721
		2559
		2551
		2549
		2528
		1030
		2546
		2508
		2507
		2364
		2353
		1708
		1707
		2035
		2017
		2015
		2427
		411
		2441
		2229
		2228
		1540
		894
		2429
		2158
		2417
		2390
		1478
		1476
		2342
		2320
		2318
		2315
		2314
		2323
		2308
		2301
		1490
		2293
		2289
		2288
		1751
		1473
		2278
		1543
		1541
		2267
		2265
		2263
		1455
		2242
		1466
		2161
		2157
		2147
		2120
		2112
		1782
		1401
		2107
		1868
		1983
		1969
		1967
		1948
		880
		1828
		1886
		1883
		1882
		1877
		1876
		1851
		1865
		1733
		1653
		1634
		1631
		1816
		1815
		1493
		1492
		1461
		1795
		1771
		1770
		1768
		1740
		1739
		1732
		1731
		1730
		1729
		1727
		1726
		1725
		1724
		1720
		1719
		1705
		1711
		1694
		1688
		597
		1573
		1572
		1571
		1570
		1568
		1567
		1471
		1559
		1558
		1547
		1487
		1479
		1472
		1469
		1468
		1460
		1463
		879
		1458
		1457
		1454
		1452
		632
		1400
		1393
		1041
		1043
		1042
		1039
		1038
		1037
		1036
		1035
		927
		541
		935
		1011
		448
		1003
		1002
		991
		990
		956
		937
		596
		911
		447
		434
		433
		427
		426
		421
		435
		607
		587
		584
		595
		410
		348
		585
		667
		666
		665
		664
		663
		662
		661
		660
		659
		658
		657
		656
		655
		654
		445
		594
		593
		591
		588
		544
		605
		603
		590
		589
		346
		360
		546
		545
		543
		542
		540
		530
		451
		450
		526
		361
		446
		444
		357
		No newline at end of file

deepchem/molnet/init.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -14,7 +14,7 @@ from deepchem.molnet.load_function.kaggle_datasets import load_kaggle
		from deepchem.molnet.load_function.lipo_datasets import load_lipo
		from deepchem.molnet.load_function.muv_datasets import load_muv
		from deepchem.molnet.load_function.nci_datasets import load_nci
		from deepchem.molnet.load_function.pcba_datasets import load_pcba
		from deepchem.molnet.load_function.pcba_datasets import load_pcba, load_pcba_146, load_pcba_2475
		from deepchem.molnet.load_function.pdbbind_datasets import load_pdbbind_grid
		from deepchem.molnet.load_function.ppb_datasets import load_ppb
		from deepchem.molnet.load_function.qm7_datasets import load_qm7

Admin message