Commit 3819b925 authored by leswing's avatar leswing
Browse files

Store metadata as HDF5 file

parent 277432b7
Loading
Loading
Loading
Loading
+24 −10
Original line number Diff line number Diff line
@@ -4,13 +4,15 @@ Contains wrapper class for datasets.
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import json
import os
import numpy as np
import pandas as pd
import random
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import save_to_disk, save_metadata
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import log
from pandas import read_hdf
import tempfile
import time
import shutil
@@ -431,11 +433,7 @@ class DiskDataset(Dataset):
    self.verbose = verbose

    log("Loading dataset from disk.", self.verbose)
    if os.path.exists(self._get_metadata_filename()):
      (self.tasks,
       self.metadata_df) = load_from_disk(self._get_metadata_filename())
    else:
      raise ValueError("No metadata found on disk.")
    self.tasks, self.metadata_df = self.load_metadata()

  @staticmethod
  def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
@@ -464,12 +462,27 @@ class DiskDataset(Dataset):
          DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w,
                                         ids))
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    metadata_filename = os.path.join(data_dir, "metadata.joblib")
    save_to_disk((tasks, metadata_df), metadata_filename)
    save_metadata(tasks, metadata_df, data_dir)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)

  def load_metadata(self):
    """Load this dataset's tasks and metadata dataframe from disk.

    Tries the current on-disk format first (``tasks.json`` +
    ``metadata.hd5``).  If that is missing or unreadable, falls back to
    the legacy joblib pickle and transparently converts it to the new
    format so subsequent loads take the fast path.

    Returns
    -------
    tuple
      ``(tasks, metadata_df)`` where ``tasks`` is a list of task names
      and ``metadata_df`` is a ``pd.DataFrame``.

    Raises
    ------
    ValueError
      If no metadata exists on disk in either format.
    """
    try:
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
      metadata_df = read_hdf(metadata_filename, 'metadata')
      return tasks, metadata_df
    except Exception:
      # New-format metadata absent or unreadable; try the legacy format
      # below instead of failing here.
      pass

    # Legacy path: metadata stored as a single joblib pickle.  Convert to
    # the new format on the way through, then return the loaded values.
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
      tasks, metadata_df = load_from_disk(metadata_filename)
      save_metadata(tasks, metadata_df, self.data_dir)
      return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk")

  @staticmethod
  def _construct_metadata(metadata_entries):
    """Construct a dataframe containing metadata.
@@ -590,8 +603,9 @@ class DiskDataset(Dataset):
    """
    Get standard location for metadata file.
    """
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    return metadata_filename
    metadata_filename = os.path.join(self.data_dir, "metadata.hd5")
    tasks_filename = os.path.join(self.data_dir, "tasks.json")
    return tasks_filename, metadata_filename

  def get_number_shards(self):
    """
+25 −1
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ from __future__ import unicode_literals
import joblib
from sklearn.externals import joblib as old_joblib
import gzip
import json
import pickle
import pandas as pd
import numpy as np
@@ -103,6 +104,29 @@ def load_csv_files(filenames, shard_size=None, verbose=True):
        yield df


def save_metadata(tasks, metadata_df, data_dir):
  """
  Saves the metadata for a DiskDataset

  Writes two files into ``data_dir``: ``tasks.json`` holding the task
  names, and ``metadata.hd5`` holding ``metadata_df`` under the
  ``metadata`` key.

  Parameters
  ----------
  tasks: list of str
    Tasks of DiskDataset
  metadata_df: pd.DataFrame
    Metadata dataframe describing the dataset's shards.
  data_dir: str
    Directory to store metadata

  Returns
  -------
  None
  """
  metadata_filename = os.path.join(data_dir, "metadata.hd5")
  tasks_filename = os.path.join(data_dir, "tasks.json")
  with open(tasks_filename, 'w') as fout:
    json.dump(tasks, fout)
  # Context manager guarantees the HDF store is closed even if put() raises,
  # avoiding a leaked open file handle.
  with pd.HDFStore(metadata_filename) as hdf:
    hdf.put('metadata', metadata_df)


def load_from_disk(filename):
  """Load a dataset from file."""
  name = filename