Commit bc1dd7cd authored by leswing

gzip csv vs hd5

parent e19323f2
+2 −2
@@ -472,7 +472,7 @@ class DiskDataset(Dataset):
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
-      metadata_df = read_hdf(metadata_filename, 'metadata')
+      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      return tasks, metadata_df
    except Exception as e:
      pass
@@ -607,7 +607,7 @@ class DiskDataset(Dataset):
    """
    Get standard location for metadata file.
    """
-    metadata_filename = os.path.join(self.data_dir, "metadata.hd5")
+    metadata_filename = os.path.join(self.data_dir, "metadata.csv.gzip")
    tasks_filename = os.path.join(self.data_dir, "tasks.json")
    return tasks_filename, metadata_filename
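
The reading side leans on pandas' built-in gzip support. A minimal sketch of the new load path, using an invented frame and file name rather than DeepChem's real metadata (note that to_csv also writes the row index as an unnamed first column unless index=False is passed, so the reloaded frame gains an extra column):

  import pandas as pd

  # Hypothetical stand-in for DeepChem's metadata frame.
  metadata_df = pd.DataFrame({'ids': ['shard-0-ids.joblib'],
                              'X': ['shard-0-X.joblib']})

  # Both to_csv and read_csv accept compression='gzip'; the result is an
  # ordinary CSV inside a gzip stream, so it can also be inspected with zcat.
  metadata_df.to_csv('metadata.csv.gzip', compression='gzip')
  reloaded = pd.read_csv('metadata.csv.gzip', compression='gzip')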

+2 −2
@@ -292,7 +292,7 @@ class TestSplitters(unittest.TestCase):
    y[:n_positives] = 1
    w = np.ones((n_samples, n_tasks))
    # Set half the positives to have zero weight
-    w[:n_positives / 2] = 0
+    w[:n_positives // 2] = 0
    ids = np.arange(n_samples)

    stratified_splitter = dc.splits.RandomStratifiedSplitter()
@@ -340,7 +340,7 @@ class TestSplitters(unittest.TestCase):
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    # Mask half the examples
-    w[:n_samples / 2] = 0
+    w[:n_samples // 2] = 0

    stratified_splitter = dc.splits.RandomStratifiedSplitter()
    split_indices = stratified_splitter.get_task_split_indices(
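
Both test fixes are the usual Python 3 port of integer division: / is true division and yields a float, which numpy rejects as a slice bound. A quick illustration, independent of the test itself (recent numpy raises TypeError; very old versions only deprecated float indices):

  import numpy as np

  w = np.ones((10, 3))
  n_positives = 5

  try:
    w[:n_positives / 2] = 0   # 5 / 2 == 2.5 under Python 3
  except TypeError as e:
    print(e)                  # slice indices must be integers ...

  w[:n_positives // 2] = 0    # floor division keeps the index integral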
+2 −4
@@ -119,13 +119,11 @@ def save_metadata(tasks, metadata_df, data_dir):
  """
  if isinstance(tasks, np.ndarray):
    tasks = tasks.tolist()
-  metadata_filename = os.path.join(data_dir, "metadata.hd5")
+  metadata_filename = os.path.join(data_dir, "metadata.csv.gzip")
  tasks_filename = os.path.join(data_dir, "tasks.json")
  with open(tasks_filename, 'w') as fout:
    json.dump(tasks, fout)
-  hdf = pd.HDFStore(metadata_filename)
-  hdf.put('metadata', metadata_df, format='table')
-  hdf.close()
+  metadata_df.to_csv(metadata_filename, compression='gzip')


def load_from_disk(filename):
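
Together the write and read hunks form a matching save/load pair. A self-contained sketch of the same pattern; load_metadata is a hypothetical stand-in for the DiskDataset method in the first hunk, and index_col=0 is an addition (not in the patch) that drops the row index to_csv writes out:

  import json
  import os

  import pandas as pd


  def save_metadata(tasks, metadata_df, data_dir):
    """Write tasks as JSON and metadata as a gzipped CSV, as in the patch."""
    with open(os.path.join(data_dir, "tasks.json"), 'w') as fout:
      json.dump(list(tasks), fout)
    metadata_df.to_csv(
        os.path.join(data_dir, "metadata.csv.gzip"), compression='gzip')


  def load_metadata(data_dir):
    """Inverse of save_metadata."""
    with open(os.path.join(data_dir, "tasks.json")) as fin:
      tasks = json.load(fin)
    metadata_df = pd.read_csv(
        os.path.join(data_dir, "metadata.csv.gzip"),
        compression='gzip', index_col=0)
    return tasks, metadata_df

Swapping HDFStore for a gzipped CSV drops the PyTables dependency and keeps the metadata inspectable with standard shell tools, at the cost of HDF5's typed, queryable storage; presumably the trade the commit title "gzip csv vs hd5" is weighing.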