Commit 3a0b5a89 authored by Paul Asmuth's avatar Paul Asmuth
Browse files

update the CSV parser to RFC4180

parent 4e6591c5
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -113,7 +113,9 @@ foreach(unit_test_path ${unit_test_files})

  add_test(
      NAME test-unit-${unit_test_name}
      COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${unit_test_name}
      COMMAND ${CMAKE_COMMAND} -E env
          FVIZ_TEST_SRCDIR=${CMAKE_SOURCE_DIR}
          ${CMAKE_CURRENT_BINARY_DIR}/${unit_test_name}
      DEPENDS fviz-cli test-prepare)

  set_tests_properties(
+1 −2
Original line number Diff line number Diff line
@@ -110,8 +110,7 @@ ReturnCode data_load_strings_csv(
  }

  auto data_csv = CSVData{};
  CSVParserConfig parser_opts;
  if (auto rc = parseCSV(data_str, parser_opts, &data_csv); !rc) {
  if (auto rc = csv_parse(data_str, &data_csv); !rc) {
    return rc;
  }

+133 −57
Original line number Diff line number Diff line
/**
 * Copyright (c) 2016 DeepCortex GmbH <legal@eventql.io>
 * Authors:
 *   - Paul Asmuth <paul@eventql.io>
 * This file is part of the "fviz" project
 *   Copyright (c) 2018 Paul Asmuth
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Affero General Public License ("the license") as
 * published by the Free Software Foundation, either version 3 of the License,
 * or any later version.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 *
 * In accordance with Section 7(e) of the license, the licensing of the Program
 * under the license does not imply a trademark license. Therefore any rights,
 * title and interest in our trademarks remain entirely with us.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the license for more details.
 *
 * You can be released from the requirements of the license by purchasing a
 * commercial license. Buying such a license is mandatory as soon as you develop
 * commercial activities involving this program without disclosing the source
 * code of your own applications
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "csv.h"
#include <iostream>

namespace fviz {

ReturnCode parseCSV(
ReturnCode csv_parse_field_escaped(
    std::string input,
    const CSVParserConfig& opts,
    std::list<std::vector<std::string>>* output) {
  input.push_back(0);

  std::vector<std::string> row;
  std::string buffer;
  bool quoted = false;
  bool escaped = false;
  size_t row_index = 0;

  for (const auto& byte : input) {
    if (byte == opts.escape_char) {
      if (escaped) {
        buffer += opts.escape_char;
        escaped = false;
      } else {
        escaped = true;
    size_t* offset,
    std::string* field) {
  if (input[(*offset)++] != '"') {
    return error(
        ERROR,
        "RFC4180: quoted strings must start with a double quote (\") character");
  }

  while (*offset != input.size()) {
    if (*offset + 1 != input.size() &&
        input[*offset] == '"' &&
        input[*offset + 1] == '"') {
      // escaped quote char
      field->push_back('"');
      *offset += 2;
      continue;
    } else if (input[*offset] == '"') {
      // string terminator
      *offset += 1;
      return OK;
    } else {
      // consume character
      field->push_back(input[(*offset)++]);
      continue;
    }
  }

  return errorf(ERROR, "unterminated string: '{}'", *field);
}

    if (!escaped && byte == opts.quote_char) {
      quoted = !quoted;
// Parse an unquoted ("literal") CSV field that starts at `*offset`.
//
// Every character up to, but not including, the next field terminator
// (',', '\r' or '\n') is appended to `*field`. The terminator itself is not
// consumed: on return `*offset` indexes the terminator, or equals
// `input.size()` if the input ended first. An `*offset` at or past the end of
// the input is a no-op. This function always returns OK.
//
// The input is taken by const reference so that the whole buffer is not copied
// once per field.
ReturnCode csv_parse_field_literal(
    const std::string& input,
    size_t* offset,
    std::string* field) {
  // nothing left to read; also guards against an offset past the end
  if (*offset >= input.size()) {
    return OK;
  }

  // locate the field terminator, or the end of input if there is none
  auto field_end = input.find_first_of(",\r\n", *offset);
  if (field_end == std::string::npos) {
    field_end = input.size();
  }

  // copy the literal characters in one go and leave the terminator unread
  field->append(input, *offset, field_end - *offset);
  *offset = field_end;
  return OK;
}

ReturnCode csv_parse_field(
    std::string input,
    size_t* offset,
    std::string* field) {
  if (input[*offset] == '"') {
    return csv_parse_field_escaped(input, offset, field);
  } else {
    return csv_parse_field_literal(input, offset, field);
  }

    if ((!quoted && byte == opts.column_separator) ||
        (!quoted && byte == opts.line_separator) ||
        (!quoted && byte == 0)) {
      row.emplace_back(buffer);
      buffer.clear();
  return OK;
}

ReturnCode csv_parse_record(
    std::string input,
    size_t* offset,
    std::vector<std::string>* record) {
  while (*offset != input.size()) {
    // parse next field
    std::string field;
    if (auto rc = csv_parse_field(input, offset, &field); rc) {
      record->push_back(field);
    } else {
      return rc;
    }

      if (byte == opts.line_separator) {
        ++row_index;
        output->push_back(row);
        row.clear();
    // end of file
    if (*offset == input.size()) {
      break;
    }

    // field terminator
    switch (input[*offset]) {
      // newline
      case '\r':
      case '\n':
        return OK;
      // next field
      case ',':
        ++(*offset);
        continue;
      // invalid separator
      default:
        return error(
            ERROR,
            "RFC4180: fields must be separated by a comma (,) character");
    }
  }

    buffer += byte;
    escaped = false;
  return OK;
}

  if (quoted || buffer.size() > 0) {
    return error(ERROR, "invalid csv line");
// Parse `input` as CSV and append one entry to `*output` per parsed record.
//
// Records are read one after another with csv_parse_record(); the line
// terminator that follows each record is consumed here. Both "\r\n" and a bare
// '\n' are accepted as line terminators, while a '\r' that is not immediately
// followed by '\n' is reported as an error. A line terminator at the very end
// of the input does not produce an additional record, and an empty input
// yields no records.
//
// Returns OK once the whole input has been consumed. On failure the failing
// ReturnCode is returned; records parsed before the failure have already been
// appended to `*output` and are not removed.
ReturnCode csv_parse(std::string input, CSVData* output) {
  // `offset` is advanced by csv_parse_record() and by the terminator handling
  // below, hence the empty increment clause
  for (size_t offset = 0; offset != input.size(); ) {
    // next record
    std::vector<std::string> record;
    // a truthy return code is treated as success; anything else is propagated
    if (auto rc = csv_parse_record(input, &offset, &record)) {
      output->push_back(record);
    } else {
      return rc;
    }

    // end of file
    if (offset == input.size()) {
      break;
    }

    // end of line (CRLF)
    switch (input[offset]) {
      case '\r':
        // the '\r' must be followed by '\n'; note that `++offset` steps onto
        // that '\n', which the '\n' case below then consumes
        if (offset + 1 == input.size() || input[++offset] != '\n') {
          return error(
              ERROR,
              "RFC4180: the carriage return ('\\r') character must be followed "
              "by a newline ('\\n') character unless escaped");
        } else {
          /* fallthrough */
        }
      case '\n':
        // step over the '\n' so the next iteration starts on the next record
        ++offset;
        break;
    }
  }

  return OK;
@@ -81,4 +158,3 @@ ReturnCode parseCSV(

} // namespace fviz
+13 −35
Original line number Diff line number Diff line
/**
 * Copyright (c) 2016 DeepCortex GmbH <legal@eventql.io>
 * Authors:
 *   - Paul Asmuth <paul@eventql.io>
 * This file is part of the "fviz" project
 *   Copyright (c) 2018 Paul Asmuth
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Affero General Public License ("the license") as
 * published by the Free Software Foundation, either version 3 of the License,
 * or any later version.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 *
 * In accordance with Section 7(e) of the license, the licensing of the Program
 * under the license does not imply a trademark license. Therefore any rights,
 * title and interest in our trademarks remain entirely with us.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the license for more details.
 *
 * You can be released from the requirements of the license by purchasing a
 * commercial license. Buying such a license is mandatory as soon as you develop
 * commercial activities involving this program without disclosing the source
 * code of your own applications
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once
#include <string>
@@ -28,24 +18,12 @@

namespace fviz {

// Delimiter settings for the CSV parser. A default-constructed config uses
// '\n' between lines, ',' between columns, '"' as the quote character and
// '\\' as the escape character.
struct CSVParserConfig {
  char line_separator = '\n';
  char column_separator = ',';
  char quote_char = '"';
  char escape_char = '\\';

  // user-provided on purpose: keeps the type a non-aggregate, so it can only
  // be created through this default constructor
  CSVParserConfig() {}
};

using CSVData = std::list<std::vector<std::string>>;
using CSVData = std::vector<std::vector<std::string>>;

ReturnCode parseCSV(
// Parse an RFC 4180-compliant CSV file
// https://tools.ietf.org/html/rfc4180
ReturnCode csv_parse(
    std::string input,
    const CSVParserConfig& config,
    CSVData* output);

} // namespace fviz
+20 −0
Original line number Diff line number Diff line
(plot
    limit-x (0.5 6.5)
    limit-y (-20 70)
    axes (top right bottom left)
    axis-y-label-placement (linear-align 10)
    scale-x (categorical (csv "tests/testdata/bardata_quoted.csv" var6))
    bars (
      data-x (csv "tests/testdata/bardata_quoted.csv" var6)
      data-y (csv "tests/testdata/bardata.csv" var1)
      data-y-low (csv "tests/testdata/bardata.csv" var2)
      bar-width (1em)
      bar-offset (-.8em)
      color #ccc)
    bars (
      data-x (csv "tests/testdata/bardata_quoted.csv" var6)
      data-y (csv "tests/testdata/bardata.csv" var4)
      data-y-low (csv "tests/testdata/bardata.csv" var5)
      bar-width (1em)
      bar-offset (.8em)
      color #666))
Loading