Unverified Commit 8e8bc721 authored by Suzukazole's avatar Suzukazole
Browse files

Remove old

parent a3608ddd
Loading
Loading
Loading
Loading
+6 −105
Original line number Diff line number Diff line
@@ -31,9 +31,11 @@ class _USPTOLoader(_MolnetLoader):
  def create_dataset(self) -> DiskDataset:
    """Create the dataset from the USPTO MIT test CSV file.

    Looks for "USPTO_MIT_test.csv" inside `self.data_dir`; when the file is
    not there, `download_url` is called with `USPTO_MIT_TEST` and
    `self.data_dir` as the destination. A `dc.data.CSVLoader` is then
    configured with `self.tasks`, the "smiles" column as the feature field
    and `self.featurizer`.

    Returns
    -------
    DiskDataset
      Whatever `loader.create_dataset` produces for the CSV file, requested
      with `shard_size=8192`.
    """
    dataset_file = os.path.join(self.data_dir, "USPTO_MIT_test.csv")
    # Only fetch the CSV when no local copy exists in data_dir.
    if not os.path.exists(dataset_file):
      # NOTE(review): this listing appears to interleave the pre-commit and
      # the reformatted spelling of the same two calls (download_url and
      # CSVLoader); presumably only the second spelling of each is live --
      # confirm against the repository before relying on this text.
      dc.utils.data_utils.download_url(url=USPTO_MIT_TEST, dest_dir=self.data_dir)
    loader = dc.data.CSVLoader(
      tasks=self.tasks, feature_field="smiles", featurizer=self.featurizer)
      dc.utils.data_utils.download_url(url=USPTO_MIT_TEST,
                                       dest_dir=self.data_dir)
    loader = dc.data.CSVLoader(tasks=self.tasks,
                               feature_field="smiles",
                               featurizer=self.featurizer)
    return loader.create_dataset(dataset_file, shard_size=8192)


@@ -47,105 +49,4 @@ def load_uspto(
    **kwargs
) -> Tuple[List[str], Tuple[DiskDataset, ...], List[dc.trans.Transformer]]:














"""
def load_uspto(featurizer="plain", #what does the plain featurizer mean? what are the other featurizers?
               split=None,
               num_to_load=10000, # load the whole thing!
               reload=True, #what is reload?
               verbose=False, #what is verbose?
               data_dir=None, #ig this is okay
               save_dir=None,
               **kwargs): ##have to give option to load a particular subset. and option to separate reagents
  """
  """Load USPTO dataset.

  The USPTO Dataset consists of over a million reactions from United States
  patent applications (2001-2013) and the same again from patent grants
  (1976-2013). The loader can load the entire dataset or subsets of it.
  The subsets are USPTO_STEREO, USPTO_MIT and USPTO_50k. The STEREO dataset
  contains around a million reactions with stereochemical information,
  the 50k dataset consists of 50k reactions classified into 10 classes and
  the MIT dataset consists of around 470k reactions. 
  
  See https://figshare.com/articles/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873
  for more details. The full dataset contains some 400K reactions. This causes
  an out-of-memory error on development laptop if full dataset is featurized.
  For now, return a truncated subset of dataset.
  Reloading is not entirely supported for this dataset.
  """
  """
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  # Most reaction dataset ML tasks train the prediction of products from
  # reactants. Both of these are contained in the rxn object that is output,
  # so there is no "tasks" field.
  uspto_tasks = [] #check what is tasks, but im pretty sure there are none!
  if split is not None: ##have to change this, we have train/test/valid ready.
    raise ValueError("Train/valid/test not yet supported.")
  # Download USPTO dataset
  if reload:
    save_folder = os.path.join(save_dir, "uspto-featurized", str(featurizer))
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return uspto_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir,
                              "2008-2011_USPTO_reactionSmiles_filtered.zip")
  if not os.path.exists(dataset_file): # no need to download? since its on AWS
    deepchem.utils.data_utils.download_url(url=USPTO_URL, dest_dir=data_dir)

  # Unzip #NO need to unzip, since its already in .csv
  unzip_dir = os.path.join(data_dir, "2008-2011_USPTO_reactionSmiles_filtered")
  if not os.path.exists(unzip_dir):
    deepchem.utils.data_utils.unzip_file(dataset_file, dest_dir=unzip_dir)
  # Unzipped file is a tab-separated values file (despite the .txt extension)
  filename = os.path.join(unzip_dir,
                          "2008-2011_USPTO_reactionSmiles_filtered.txt")
  rxns = []
  from rdkit.Chem import rdChemReactions
  with open(filename) as tsvfile:
    reader = csv.reader(tsvfile, delimiter="\t")
    for ind, row in enumerate(reader):
      if ind > num_to_load:
        break
      if verbose:
        print("Loading reaction %d" % ind)
      # The first element in the row is the reaction smarts
      smarts = row[0]
      # Sometimes smarts have extraneous information at the end, of the form
      # " |f:0", that causes parsing to fail. Not sure what this information
      # is, but just ignoring it for now. ##yup i need to figure out what that |f thing means as well!
      smarts = smarts.split(" ")[0]
      rxn = rdChemReactions.ReactionFromSmarts(smarts)
      rxns.append(rxn)
  rxn_array = np.array(rxns)
  # Make up dummy labels since DiskDataset.from_numpy doesn't allow
  # creation from just features for now.
  y = np.ones(len(rxn_array))
  # TODO: This dataset isn't saved to disk so reload doesn't happen.
  rxn_dataset = DiskDataset.from_numpy(rxn_array, y)
  transformers = [] #what are these transformers?
  return uspto_tasks, (rxn_dataset, None, None), transformers #it returns a diskdataset.
"""
 No newline at end of file
  pass