Commit c27da14f authored by Paul Asmuth's avatar Paul Asmuth
Browse files

update StringUtil from eventql repo

parent 110410c2
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -20,5 +20,7 @@ add_library(signaltk-util STATIC
    file.cc
    outputstream.cc
    inputstream.cc
    ISO8601.cc)
    ISO8601.cc
    UTF8.cc
    wallclock.cc)

core/util/UTF8.cc

0 → 100644
+222 −0
Original line number Diff line number Diff line
/**
 * Copyright (c) 2016 DeepCortex GmbH <legal@eventql.io>
 * Authors:
 *   - Paul Asmuth <paul@eventql.io>
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Affero General Public License ("the license") as
 * published by the Free Software Foundation, either version 3 of the License,
 * or any later version.
 *
 * In accordance with Section 7(e) of the license, the licensing of the Program
 * under the license does not imply a trademark license. Therefore any rights,
 * title and interest in our trademarks remain entirely with us.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the license for more details.
 *
 * You can be released from the requirements of the license by purchasing a
 * commercial license. Buying such a license is mandatory as soon as you develop
 * commercial activities involving this program without disclosing the source
 * code of your own applications
 */
#include "UTF8.h"
#include "exception.h"

namespace signaltk {

/**
 * Decode one UTF-8 encoded code point starting at *cur.
 *
 * On success *cur is advanced past the bytes that were consumed. Sequences of
 * one to six bytes (the original, pre-RFC 3629 UTF-8 layout) are accepted.
 * Only the lead byte and the number of available bytes are inspected; the
 * continuation bytes are masked to their low six bits without verifying the
 * 10xxxxxx marker.
 *
 * @param cur in/out pointer to the current read position
 * @param end_ pointer one past the last readable byte
 * @return the decoded code point, or 0 if *cur is at or past end_ (in which
 *         case *cur is left untouched)
 * @throws std::invalid_argument if the lead byte does not start a sequence or
 *         the sequence is cut off by end_; *cur is not advanced in that case
 */
char32_t UTF8::nextCodepoint(const char** cur, const char* end_) {
  auto begin = reinterpret_cast<const uint8_t*>(*cur);
  auto end = reinterpret_cast<const uint8_t*>(end_);

  if (begin >= end) {
    return 0;
  }

  const uint8_t lead = *begin;

  // single byte sequence (ASCII)
  if (lead < 0b10000000) {
    ++(*cur);
    return lead;
  }

  // classify the lead byte: total sequence length and its payload bits
  size_t seq_len;
  char32_t chr;
  if ((lead & 0b11100000) == 0b11000000) {
    seq_len = 2;
    chr = lead & 0b00011111;
  } else if ((lead & 0b11110000) == 0b11100000) {
    seq_len = 3;
    chr = lead & 0b00001111;
  } else if ((lead & 0b11111000) == 0b11110000) {
    seq_len = 4;
    chr = lead & 0b00000111;
  } else if ((lead & 0b11111100) == 0b11111000) {
    seq_len = 5;
    chr = lead & 0b00000011;
  } else if ((lead & 0b11111110) == 0b11111100) {
    seq_len = 6;
    chr = lead & 0b00000001;
  } else {
    // stray continuation byte or 0xFE/0xFF
    throw std::invalid_argument("invalid UTF8 encoding");
  }

  // begin < end holds here, so the difference is positive
  if (static_cast<size_t>(end - begin) < seq_len) {
    throw std::invalid_argument("invalid UTF8 encoding");
  }

  // Every continuation byte contributes six bits. Accumulating with a uniform
  // shift puts the lead bits at 6 * (seq_len - 1), i.e. bit 24 for five byte
  // sequences (the previous hand-unrolled code shifted them by 14).
  for (size_t i = 1; i < seq_len; ++i) {
    chr = (chr << 6) | (begin[i] & 0b00111111);
  }

  *cur += seq_len;
  return chr;
}

/**
 * Convenience overload: validates the bytes held by str by forwarding to the
 * pointer/length overload.
 *
 * @param str the string to check
 * @return whatever isValidUTF8(const char*, size_t) reports for its contents
 */
bool UTF8::isValidUTF8(const String& str) {
  const char* bytes = str.data();
  const size_t num_bytes = str.size();
  return UTF8::isValidUTF8(bytes, num_bytes);
}

/**
 * Check that the buffer consists of complete UTF-8 sequences.
 *
 * Every lead byte must announce a sequence of one to six bytes that fits
 * entirely into the buffer. Like nextCodepoint, this only looks at lead bytes
 * and sequence lengths; the 10xxxxxx marker of continuation bytes is not
 * verified.
 *
 * @param str pointer to the first byte to check
 * @param size number of bytes to check
 * @return true if all sequences are complete (an empty buffer is valid),
 *         false on a truncated sequence or a byte that cannot start one
 */
bool UTF8::isValidUTF8(const char* str, size_t size) {
  auto end = str + size;

  for (auto cur = str; cur < end; ) {
    const auto lead = *reinterpret_cast<const uint8_t*>(cur);

    size_t seq_len;
    if (lead < 0b10000000) {
      // ASCII: keep scanning (this used to return true right away, so only
      // the bytes up to the first ASCII character were ever checked)
      seq_len = 1;
    } else if ((lead & 0b11100000) == 0b11000000) {
      seq_len = 2;
    } else if ((lead & 0b11110000) == 0b11100000) {
      seq_len = 3;
    } else if ((lead & 0b11111000) == 0b11110000) {
      seq_len = 4;
    } else if ((lead & 0b11111100) == 0b11111000) {
      seq_len = 5;
    } else if ((lead & 0b11111110) == 0b11111100) {
      seq_len = 6;
    } else {
      // Stray continuation byte or 0xFE/0xFF. Previously no branch matched
      // and cur was never advanced, so the loop never terminated.
      return false;
    }

    // cur < end holds here, so the difference is positive
    if (static_cast<size_t>(end - cur) < seq_len) {
      return false;
    }

    cur += seq_len;
  }

  return true;
}

/**
 * Append the UTF-8 encoding of a code point to a string.
 *
 * Uses the shortest of the one to six byte layouts that can hold the value,
 * so values above U+10FFFF are emitted with the legacy four, five or six byte
 * forms rather than rejected.
 *
 * @param codepoint the code point to encode
 * @param target the string the encoded bytes are appended to
 */
void UTF8::encodeCodepoint(char32_t codepoint, String* target) {
  if (codepoint < 0b10000000) {
    *target += static_cast<char>(codepoint);
    return;
  }

  if (codepoint < 0b100000000000) {
    *target += static_cast<char>(0b11000000 | ((codepoint >> 6) & 0b00011111));
    *target += static_cast<char>(0b10000000 | (codepoint        & 0b00111111));
    return;
  }

  if (codepoint < 0b10000000000000000) {
    *target += static_cast<char>(0b11100000 | ((codepoint >> 12) & 0b00001111));
    *target += static_cast<char>(0b10000000 | ((codepoint >> 6)  & 0b00111111));
    *target += static_cast<char>(0b10000000 | (codepoint         & 0b00111111));
    return;
  }

  if (codepoint < 0b1000000000000000000000) {
    *target += static_cast<char>(0b11110000 | ((codepoint >> 18) & 0b00000111));
    *target += static_cast<char>(0b10000000 | ((codepoint >> 12) & 0b00111111));
    *target += static_cast<char>(0b10000000 | ((codepoint >> 6)  & 0b00111111));
    *target += static_cast<char>(0b10000000 | (codepoint         & 0b00111111));
    return;
  }

  if (codepoint < 0b100000000000000000000000000) {
    *target += static_cast<char>(0b11111000 | ((codepoint >> 24) & 0b00000011));
    *target += static_cast<char>(0b10000000 | ((codepoint >> 18) & 0b00111111));
    *target += static_cast<char>(0b10000000 | ((codepoint >> 12) & 0b00111111));
    *target += static_cast<char>(0b10000000 | ((codepoint >> 6)  & 0b00111111));
    *target += static_cast<char>(0b10000000 | (codepoint         & 0b00111111));
    return;
  }

  // six byte form; the fourth byte carries bits 12..17 (it used to be taken
  // from bit 11, which produced a sequence that decodes to a different value)
  *target += static_cast<char>(0b11111100 | ((codepoint >> 30) & 0b00000001));
  *target += static_cast<char>(0b10000000 | ((codepoint >> 24) & 0b00111111));
  *target += static_cast<char>(0b10000000 | ((codepoint >> 18) & 0b00111111));
  *target += static_cast<char>(0b10000000 | ((codepoint >> 12) & 0b00111111));
  *target += static_cast<char>(0b10000000 | ((codepoint >> 6)  & 0b00111111));
  *target += static_cast<char>(0b10000000 | (codepoint         & 0b00111111));
}

} // namespace signaltk

core/util/UTF8.h

0 → 100644
+46 −0
Original line number Diff line number Diff line
/**
 * Copyright (c) 2016 DeepCortex GmbH <legal@eventql.io>
 * Authors:
 *   - Paul Asmuth <paul@eventql.io>
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Affero General Public License ("the license") as
 * published by the Free Software Foundation, either version 3 of the License,
 * or any later version.
 *
 * In accordance with Section 7(e) of the license, the licensing of the Program
 * under the license does not imply a trademark license. Therefore any rights,
 * title and interest in our trademarks remain entirely with us.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the license for more details.
 *
 * You can be released from the requirements of the license by purchasing a
 * commercial license. Buying such a license is mandatory as soon as you develop
 * commercial activities involving this program without disclosing the source
 * code of your own applications
 */
#pragma once
#include <stdlib.h>
#include <stdint.h>
#include <string>
#include <vector>
#include <locale>
#include "stdtypes.h"

namespace signaltk {

/**
 * Static helpers for reading and writing UTF-8 encoded byte sequences.
 */
class UTF8 {
public:

  /**
   * Decode the code point that starts at *cur and advance *cur past it.
   *
   * @param cur in/out pointer to the current read position
   * @param end pointer one past the last readable byte
   * @return the decoded code point, or 0 if *cur is at or past end
   * @throws std::invalid_argument if the lead byte is not a valid sequence
   *         start or the sequence is cut off by end
   */
  static char32_t nextCodepoint(const char** cur, const char* end);

  /**
   * Append the UTF-8 encoding of codepoint (one to six bytes, depending on
   * its magnitude) to *target.
   *
   * @param codepoint the code point to encode
   * @param target the string to append the encoded bytes to
   */
  static void encodeCodepoint(char32_t codepoint, String* target);

  /**
   * Check whether the given bytes look like well-formed UTF-8.
   *
   * NOTE(review): the implementation only inspects lead bytes and sequence
   * lengths; continuation bytes are not verified.
   *
   * @param str the bytes to check (string overload: the string's contents)
   * @param size number of bytes to check
   */
  static bool isValidUTF8(const String& str);
  static bool isValidUTF8(const char* str, size_t size);

};

} // namespace signaltk
+297 −13
Original line number Diff line number Diff line
/**
 * This file is part of the "FnordMetric" project
 *   Copyright (c) 2018 Paul Asmuth
 *   Copyright (c) 2014 Paul Asmuth, Google Inc.
 *
 * FnordMetric is free software: you can redistribute it and/or modify it under
@@ -8,8 +9,10 @@
 * <http://www.gnu.org/licenses/>.
 */
#include <string>
#include <assert.h>
#include "bufferutil.h"
#include "stringutil.h"
#include "UTF8.h"

namespace signaltk {

@@ -25,6 +28,11 @@ std::string StringUtil::toString(const char* value) {
  return value;
}

/**
 * toString specialization for mutable C strings: copies the NUL terminated
 * character sequence into a std::string.
 */
template <>
std::string StringUtil::toString(char* value) {
  return std::string(value);
}

template <>
std::string StringUtil::toString(int value) {
  return std::to_string(value);
@@ -80,8 +88,17 @@ template <>
/**
 * toString specialization for doubles.
 *
 * Formats the value with "%f" and strips redundant trailing zeros, always
 * keeping at least one digit after the decimal point ("1.500000" -> "1.5",
 * "2.000000" -> "2.0").
 *
 * @param value the value to format
 * @return the formatted number, or an empty string if formatting failed
 */
std::string StringUtil::toString(double value) {
  char buf[128];
  *buf = 0;

  const int len = snprintf(buf, sizeof(buf), "%f", value);
  if (len < 0) {
    return std::string{};
  }

  std::string out;
  if (static_cast<size_t>(len) < sizeof(buf)) {
    out.assign(buf, static_cast<size_t>(len));
  } else {
    // snprintf reports the length the full output would have had; large
    // magnitudes (e.g. 1e300) need more than the stack buffer, so format
    // again into a buffer of the required size instead of reading past buf
    out.resize(static_cast<size_t>(len) + 1);
    snprintf(&out[0], out.size(), "%f", value);
    out.resize(static_cast<size_t>(len));
  }

  while (out.size() > 2 &&
         out[out.size() - 1] == '0' &&
         out[out.size() - 2] != '.') {
    out.pop_back();
  }

  return out;
}

template <>
@@ -89,12 +106,30 @@ std::string StringUtil::toString(bool value) {
  return value ? "true" : "false";
}

/**
 * Remove leading space characters (' ') from the string in place.
 *
 * Safe for empty strings and strings that consist only of spaces (the
 * previous front()-based loop invoked undefined behaviour in both cases).
 *
 * @param str the string to modify
 */
void StringUtil::ltrim(std::string* str) {
  // find_first_not_of yields npos when nothing but spaces is left, in which
  // case erase(0, npos) clears the whole string
  str->erase(0, str->find_first_not_of(' '));
}

/**
 * Remove trailing space characters (' ') from the string in place.
 *
 * Safe for empty strings and strings that consist only of spaces (the
 * previous back()-based loop invoked undefined behaviour in both cases).
 *
 * @param str the string to modify
 */
void StringUtil::rtrim(std::string* str) {
  const auto last_kept = str->find_last_not_of(' ');
  if (last_kept == std::string::npos) {
    str->clear();
  } else {
    str->erase(last_kept + 1);
  }
}

/**
 * Remove all trailing '/' characters from the string in place.
 *
 * Safe for empty strings and strings that consist only of slashes (the
 * previous back()-based loop invoked undefined behaviour in both cases).
 *
 * @param str the string to modify
 */
void StringUtil::stripTrailingSlashes(std::string* str) {
  const auto last_kept = str->find_last_not_of('/');
  if (last_kept == std::string::npos) {
    str->clear();
  } else {
    str->erase(last_kept + 1);
  }
}

/**
 * Remove all trailing '\n' and '\r' characters from the string in place.
 *
 * Safe for empty strings and strings that consist only of line breaks (the
 * previous back()-based loop invoked undefined behaviour in both cases).
 *
 * @param str the string to modify
 */
void StringUtil::chomp(std::string* str) {
  const auto last_kept = str->find_last_not_of("\n\r");
  if (last_kept == std::string::npos) {
    str->clear();
  } else {
    str->erase(last_kept + 1);
  }
}

void StringUtil::replaceAll(
    std::string* str,
    const std::string& pattern,
@@ -103,7 +138,7 @@ void StringUtil::replaceAll(
    return;
  }

  auto cur = 0;
  size_t cur = 0;
  while ((cur = str->find(pattern, cur)) != std::string::npos) {
    str->replace(cur, pattern.size(), replacement);
    cur += replacement.size();
@@ -113,6 +148,7 @@ void StringUtil::replaceAll(
std::vector<std::string> StringUtil::split(
      const std::string& str,
      const std::string& pattern) {
  assert(!pattern.empty());
  std::vector<std::string> parts;

  size_t begin = 0;
@@ -124,7 +160,7 @@ std::vector<std::string> StringUtil::split(
      break;
    } else {
      parts.emplace_back(str.substr(begin, end - begin));
      begin = end + pattern.length();
      begin = end + pattern.size();
    }
  }

@@ -134,7 +170,7 @@ std::vector<std::string> StringUtil::split(
String StringUtil::join(const Vector<String>& list, const String& join) {
  String out;

  for (int i = 0; i < list.size(); ++i) {
  for (size_t i = 0; i < list.size(); ++i) {
    if (i > 0) {
      out += join;
    }
@@ -145,6 +181,21 @@ String StringUtil::join(const Vector<String>& list, const String& join) {
  return out;
}

/**
 * Concatenate the elements of a set, in the set's iteration order, placing
 * the join string between consecutive elements.
 *
 * @param list the elements to concatenate
 * @param join the separator inserted between two elements
 * @return the joined string (empty for an empty set)
 */
String StringUtil::join(const Set<String>& list, const String& join) {
  String joined;
  bool is_first = true;

  for (auto it = list.begin(); it != list.end(); ++it) {
    if (is_first) {
      is_first = false;
    } else {
      joined += join;
    }

    joined += *it;
  }

  return joined;
}

bool StringUtil::beginsWith(const std::string& str, const std::string& prefix) {
  if (str.length() < prefix.length()) {
    return false;
@@ -168,6 +219,176 @@ bool StringUtil::endsWith(const std::string& str, const std::string& suffix) {
      suffix) == 0;
}

/**
 * Lexicographically compare two byte ranges as unsigned characters.
 *
 * @return -1 if s1 sorts before s2, 1 if it sorts after, 0 if both ranges are
 *         equal; when one range is a prefix of the other the shorter one
 *         sorts first
 */
int StringUtil::compare(
    const char* s1,
    size_t s1_len,
    const char* s2,
    size_t s2_len) {
  const size_t common_len = s1_len < s2_len ? s1_len : s2_len;

  for (size_t i = 0; i < common_len; ++i) {
    const auto lhs = static_cast<uint8_t>(s1[i]);
    const auto rhs = static_cast<uint8_t>(s2[i]);
    if (lhs != rhs) {
      return lhs < rhs ? -1 : 1;
    }
  }

  // identical up to the shorter length: the longer range is the greater one
  if (s1_len == s2_len) {
    return 0;
  }

  return s1_len > s2_len ? 1 : -1;
}


/**
 * Check that every character of the string is one of 0-9, a-f or A-F.
 *
 * @param str the string to check
 * @return true if no other character occurs (also for the empty string)
 */
bool StringUtil::isHexString(const std::string& str) {
  for (size_t i = 0; i < str.size(); ++i) {
    const char chr = str[i];
    const bool is_decimal = chr >= '0' && chr <= '9';
    const bool is_lower_hex = chr >= 'a' && chr <= 'f';
    const bool is_upper_hex = chr >= 'A' && chr <= 'F';

    if (!is_decimal && !is_lower_hex && !is_upper_hex) {
      return false;
    }
  }

  return true;
}

/**
 * Check that every character of the string is alphanumeric (0-9a-zA-Z).
 *
 * @param str the string to check
 * @return true if no other character occurs (also for the empty string)
 */
bool StringUtil::isAlphanumeric(const std::string& str) {
  auto it = str.begin();
  while (it != str.end() && isAlphanumeric(*it)) {
    ++it;
  }

  // reaching the end means no offending character was found
  return it == str.end();
}

/**
 * Check whether the character is one of 0-9, a-z or A-Z.
 *
 * @param chr the character to check
 */
bool StringUtil::isAlphanumeric(char chr) {
  if (chr >= '0' && chr <= '9') {
    return true;
  }

  if (chr >= 'a' && chr <= 'z') {
    return true;
  }

  return chr >= 'A' && chr <= 'Z';
}

/**
 * Check that every character of the string passes isShellSafe(char).
 *
 * @param str the string to check
 * @return true if no other character occurs (also for the empty string)
 */
bool StringUtil::isShellSafe(const std::string& str) {
  auto it = str.begin();
  while (it != str.end() && isShellSafe(*it)) {
    ++it;
  }

  // reaching the end means no offending character was found
  return it == str.end();
}

/**
 * Check whether the character is one of 0-9, a-z, A-Z, '_', '-' or '.'.
 *
 * @param chr the character to check
 */
bool StringUtil::isShellSafe(char chr) {
  if (chr >= '0' && chr <= '9') {
    return true;
  }

  if (chr >= 'a' && chr <= 'z') {
    return true;
  }

  if (chr >= 'A' && chr <= 'Z') {
    return true;
  }

  return chr == '_' || chr == '-' || chr == '.';
}

/**
 * Check that the string consists only of digits by forwarding its byte range
 * to the pointer overload.
 *
 * @param str the string to check
 */
bool StringUtil::isDigitString(const std::string& str) {
  const char* first = str.data();
  const char* last = first + str.size();
  return isDigitString(first, last);
}

/**
 * Check that every character in [begin, end) is a decimal digit.
 *
 * The test is a plain range comparison instead of isdigit(): passing a
 * negative char (any byte >= 0x80 where char is signed) to isdigit() is
 * undefined behaviour, and the range check needs no <cctype>.
 *
 * @param begin pointer to the first character
 * @param end pointer one past the last character
 * @return true if only '0'..'9' occur (also for an empty range)
 */
bool StringUtil::isDigitString(const char* begin, const char* end) {
  for (auto cur = begin; cur < end; ++cur) {
    if (*cur < '0' || *cur > '9') {
      return false;
    }
  }

  return true;
}

/**
 * Check whether the string is a number by forwarding its byte range to the
 * pointer overload.
 *
 * @param str the string to check
 */
bool StringUtil::isNumber(const std::string& str) {
  const char* first = str.data();
  const char* last = first + str.size();
  return isNumber(first, last);
}

/**
 * Check whether [begin, end) holds a decimal number: an optional leading '-',
 * digits, and optionally a '.' or ',' separator followed by more digits.
 *
 * As before, neither digit run is required to be non-empty, so an empty
 * range and a lone "-" are still accepted.
 *
 * @param begin pointer to the first character
 * @param end pointer one past the last character
 * @return true if the whole range matches, false otherwise
 */
bool StringUtil::isNumber(const char* begin, const char* end) {
  // plain range check: isdigit() is undefined for negative char values
  const auto is_digit = [] (char c) { return c >= '0' && c <= '9'; };

  auto cur = begin;

  if (cur < end && *cur == '-') {
    ++cur;
  }

  // Integer part: stop at the first non-digit instead of rejecting it, so
  // that the separator check below is actually reached. The old loop
  // returned false here, which made every input like "1.5" fail.
  while (cur < end && is_digit(*cur)) {
    ++cur;
  }

  if (cur < end && (*cur == '.' || *cur == ',')) {
    ++cur;
  }

  // fractional part
  while (cur < end && is_digit(*cur)) {
    ++cur;
  }

  // anything left over is not part of a number
  return cur >= end;
}

/**
 * Convert the string to lowercase in place, one byte at a time, using
 * std::tolower.
 *
 * @param str the string to modify
 */
void StringUtil::toLower(std::string* str) {
  for (auto& chr : *str) {
    // std::tolower requires a value representable as unsigned char; passing
    // a negative char (bytes >= 0x80 where char is signed) is undefined
    chr = static_cast<char>(std::tolower(static_cast<unsigned char>(chr)));
  }
}

/**
 * Convert the string to uppercase in place, one byte at a time, using
 * std::toupper.
 *
 * @param str the string to modify
 */
void StringUtil::toUpper(std::string* str) {
  for (auto& chr : *str) {
    // std::toupper requires a value representable as unsigned char; passing
    // a negative char (bytes >= 0x80 where char is signed) is undefined
    chr = static_cast<char>(std::toupper(static_cast<unsigned char>(chr)));
  }
}

/**
 * Locate the first occurrence of a character.
 *
 * @param str the string to search
 * @param chr the character to look for
 * @return the index of the first match, or std::string::npos (the value the
 *         former "return -1" converted to) if chr does not occur
 */
size_t StringUtil::find(const std::string& str, char chr) {
  return str.find(chr);
}

/**
 * Locate the last occurrence of a character.
 *
 * Delegates to std::string::rfind; the previous loop kept its index in an
 * int, which cannot address strings longer than INT_MAX bytes.
 *
 * @param str the string to search
 * @param chr the character to look for
 * @return the index of the last match, or std::string::npos (the value the
 *         former "return -1" converted to) if chr does not occur
 */
size_t StringUtil::findLast(const std::string& str, char chr) {
  return str.rfind(chr);
}

/**
 * Check whether subject occurs anywhere in str (case-sensitive).
 *
 * @param str the string to search
 * @param subject the substring to look for
 */
bool StringUtil::includes(const std::string& str, const std::string& subject) {
  const auto match_pos = str.find(subject);
  return match_pos != std::string::npos;
}

/**
 * Check whether subject occurs anywhere in str, ignoring case. Both inputs
 * are copied and lowercased with toLower before the search.
 *
 * @param str the string to search
 * @param subject the substring to look for
 */
bool StringUtil::includesi(const std::string& str, const std::string& subject) {
  std::string haystack(str);
  std::string needle(subject);

  StringUtil::toLower(&haystack);
  StringUtil::toLower(&needle);

  return haystack.find(needle) != std::string::npos;
}

std::string StringUtil::hexPrint(
    const void* data,
    size_t size,
@@ -182,7 +403,7 @@ std::string StringUtil::formatv(
    std::vector<std::string> values) {
  std::string str = fmt;

  for (int i = 0; i < values.size(); ++i) {
  for (size_t i = 0; i < values.size(); ++i) {
    StringUtil::replaceAll(
        &str,
        "$" + std::to_string(i),
@@ -192,15 +413,78 @@ std::string StringUtil::formatv(
  return str;
}

std::wstring StringUtil::convertUTF8To16(const std::string& str) {
  std::wstring out;
  out.assign(str.begin(), str.end());
/**
 * Decode a UTF-8 encoded string into a sequence of UTF-32 code points.
 *
 * @param str the UTF-8 encoded input
 * @return one char32_t per decoded code point
 * @throws std::invalid_argument (from UTF8::nextCodepoint) on malformed input
 */
std::basic_string<char32_t> StringUtil::convertUTF8To32(
    const std::basic_string<char>& str) {
  std::basic_string<char32_t> out;

  const char* cur = str.data();
  const char* end = cur + str.length();

  // Loop on the read position rather than on the decoded value:
  // nextCodepoint returns 0 both at the end of input and for an embedded
  // U+0000, so testing the value would silently drop everything after a NUL.
  while (cur < end) {
    out += UTF8::nextCodepoint(&cur, end);
  }

  return out;
}


/**
 * Decode a UTF-8 encoded string and re-encode it as UTF-16.
 *
 * Code points up to U+FFFF become a single code unit, code points up to
 * U+10FFFF become a surrogate pair. Values beyond U+10FFFF (which the
 * decoder's legacy five/six byte forms can produce) have no UTF-16
 * representation and are replaced with U+FFFD.
 *
 * @param str the UTF-8 encoded input
 * @return the UTF-16 code units
 * @throws std::invalid_argument (from UTF8::nextCodepoint) on malformed input
 */
std::basic_string<char16_t> StringUtil::convertUTF8To16(
    const std::basic_string<char>& str) {
  std::basic_string<char16_t> out;

  const char* cur = str.data();
  const char* end = cur + str.length();

  // Loop on the read position: the decoded value is 0 for an embedded U+0000
  // and, when narrowed to 16 bit as before, also for values like U+10000,
  // both of which used to end the conversion early.
  while (cur < end) {
    char32_t chr = UTF8::nextCodepoint(&cur, end);

    if (chr < 0x10000) {
      out += static_cast<char16_t>(chr);
    } else if (chr <= 0x10FFFF) {
      // split the 20 bit offset into a high and a low surrogate
      chr -= 0x10000;
      out += static_cast<char16_t>(0xD800 + (chr >> 10));
      out += static_cast<char16_t>(0xDC00 + (chr & 0x3FF));
    } else {
      out += static_cast<char16_t>(0xFFFD);
    }
  }

  return out;
}

std::basic_string<char> StringUtil::convertUTF32To8(
    const std::basic_string<char32_t>& str) {
  String out;

  for (const auto& c : str) {
    UTF8::encodeCodepoint(c, &out);
  }

  return out;
}

std::string StringUtil::convertUTF16To8(const std::wstring& str) {
  std::string out;
  out.assign(str.begin(), str.end());
/**
 * Convert a UTF-16 string to UTF-8.
 *
 * A high surrogate followed by a low surrogate is combined into the
 * supplementary code point it represents before encoding; previously each
 * half was encoded on its own, which does not yield valid UTF-8. Unpaired
 * surrogates are still passed through unchanged, as before.
 *
 * @param str the UTF-16 code units to convert
 * @return the UTF-8 encoded bytes
 */
std::basic_string<char> StringUtil::convertUTF16To8(
    const std::basic_string<char16_t>& str) {
  String out;

  for (size_t i = 0; i < str.size(); ++i) {
    char32_t chr = str[i];

    const bool is_high_surrogate = chr >= 0xD800 && chr <= 0xDBFF;
    if (is_high_surrogate && i + 1 < str.size()) {
      const char32_t next = str[i + 1];
      if (next >= 0xDC00 && next <= 0xDFFF) {
        chr = 0x10000 + ((chr - 0xD800) << 10) + (next - 0xDC00);
        ++i; // the low surrogate has been consumed
      }
    }

    UTF8::encodeCodepoint(chr, &out);
  }

  return out;
}

/**
 * Count the code points in a UTF-8 encoded string.
 *
 * @param str the UTF-8 encoded input
 * @return the number of code points, including embedded U+0000
 * @throws std::invalid_argument (from UTF8::nextCodepoint) on malformed input
 */
size_t StringUtil::countUTF8CodePoints(const std::string& str) {
  size_t count = 0;
  const char* cur = str.data();
  const char* end = cur + str.length();

  // Loop on the read position: nextCodepoint returns 0 for an embedded
  // U+0000 as well as at the end, so testing its result stopped counting at
  // the first NUL byte.
  while (cur < end) {
    UTF8::nextCodepoint(&cur, end);
    ++count;
  }

  return count;
}


/**
 * Build a copy of the string that keeps only the characters 0-9, a-z, A-Z,
 * '_', '-' and '.', in their original order.
 *
 * @param str the string to filter
 * @return the filtered copy
 */
String StringUtil::stripShell(const std::string& str) {
  String filtered;

  for (size_t i = 0; i < str.size(); ++i) {
    const char chr = str[i];

    // isShellSafe(char) accepts exactly the alphanumerics plus '_', '-', '.'
    if (!isShellSafe(chr)) {
      continue;
    }

    filtered += chr;
  }

  return filtered;
}

+163 −2
Original line number Diff line number Diff line
/**
 * This file is part of the "FnordMetric" project
 *   Copyright (c) 2018 Paul Asmuth
 *   Copyright (c) 2014 Paul Asmuth, Google Inc.
 *
 * FnordMetric is free software: you can redistribute it and/or modify it under
@@ -48,6 +49,35 @@ public:
   */
  static void stripTrailingSlashes(std::string* str);

  /**
   * Remove trailing newlines
   *
   * @param str the string to remove trailing newlines from
   */
  static void chomp(std::string* str);

  /**
   * Remove leading whitespace from the string
   *
   * @param str the string to remove leading whitespace from
   */
  static void ltrim(std::string* str);

  /**
   * Remove trailing whitespace from the string
   *
   * @param str the string to remove trailing whitespace from
   */
  static void rtrim(std::string* str);

  /**
   * Remove all characters except [A-Za-z0-9_-.] from the string
   *
   * @param str the string to remove chars from
   * @return the stripped string
   */
  static String stripShell(const std::string& str);

  /**
   * Check if the provided string begins with the provided prefix
   *
@@ -70,6 +100,73 @@ public:
      const std::string& str,
      const std::string& suffix);

  /**
   * Lexicographically compare the strings s1 and s2. Return an integer greater
   * than, equal to, or less than 0, according as the string s1 is greater than,
   * equal to, or less than the string s2.
   *
   * The comparison is done using unsigned characters, so that `\200' is greater
   * than `\0'.
   */
  static int compare(
      const char* s1,
      size_t s1_len,
      const char* s2,
      size_t s2_len);

  /**
   * Check if the provided string contains only 0-9a-fA-f
   *
   * @param str the string to check
   */
  static bool isHexString(const std::string& str);

  /**
   * Check if the provided string consists only of 0-9a-zA-Z
   *
   * @param chr the char to check
   */
  static bool isAlphanumeric(const String& string);

  /**
   * Check if the provided char is one of 0-9a-zA-Z
   *
   * @param chr the char to check
   */
  static bool isAlphanumeric(char chr);

  /**
   * Check if the provided string consists only of 0-9a-zA-Z-_.
   *
   * @param chr the char to check
   */
  static bool isShellSafe(const String& string);

  /**
   * Check if the provided char is one of 0-9a-zA-Z-_.
   *
   * @param chr the char to check
   */
  static bool isShellSafe(char chr);

  /**
   * Check if the provided string consists only of digits
   *
   * @param str the string to check
   * @return true if the string consists only of digits, false otherwise
   */
  static bool isDigitString(const std::string& str);
  static bool isDigitString(const char* begin, const char* end);

  /**
   * Check if the provided string matches the pattern ^-?[0-9]+([,\.][0-9])$
   *
   * @param str the string to check
   * @return true if the string consists only of digits, false otherwise
   */
  static bool isNumber(const std::string& str);
  static bool isNumber(const char* begin, const char* end);

  /**
   * Replace all occurences of pattern with replacement in str
   *
@@ -93,6 +190,58 @@ public:
   * Join the provided string array with the provided join string
   */
  static String join(const Vector<String>& list, const String& join);
  static String join(const Set<String>& list, const String& join);

  /**
   * Converts the provided string to all lowercase
   *
   * @param str the str that should be modified
   */
  static void toLower(std::string* str);

  /**
   * Converts the provided string to all uppercase
   *
   * @param str the str that should be modified
   */
  static void toUpper(std::string* str);

  /**
   * Finds the first occurence of the provided character in the string. Returns
   * std::string::npos if the character does not occur in the string.
   *
   * @param str the str that should be searched
   * @param chr the char to search for
   * @return the position of the first occurence of chr or std::string::npos
   */
  static size_t find(const std::string& str, char chr);

  /**
   * Finds the last occurence of the provided character in the string. Returns
   * std::string::npos if the character does not occur in the string.
   *
   * @param str the str that should be searched
   * @param chr the char to search for
   * @return the position of the last occurence of chr or std::string::npos
   */
  static size_t findLast(const std::string& str, char chr);

  /**
   * Returns true iff str contains the subject or is equal ot the subject
   *
   * @param str the str to be searched
   * @param subject the substring to search for
   */
  static bool includes(const std::string& str, const std::string& subject);

  /**
   * Returns true iff str contains the subject or is equal ot the subject,
   * matches in a case-insensitive fashion
   *
   * @param str the str to be searched
   * @param subject the substring to search for
   */
  static bool includesi(const std::string& str, const std::string& subject);

  /**
   * Print the contents of the pointed to memory as a series of hexadecimal
@@ -214,8 +363,20 @@ public:
  template <typename T>
  static std::string formatNumberScientific(T value);

  static std::wstring convertUTF8To16(const std::string& str);
  static std::string convertUTF16To8(const std::wstring& str);

  /**
   * Decode a UTF-8 encoded string into UTF-32 code points
   *
   * @param str the UTF-8 encoded string
   * @return the decoded code points
   * @throws std::invalid_argument if the input is not decodable (raised by
   *         UTF8::nextCodepoint)
   */
  static std::basic_string<char32_t> convertUTF8To32(
      const std::basic_string<char>& str);

  /**
   * Decode a UTF-8 encoded string into 16 bit code units
   *
   * @param str the UTF-8 encoded string
   * @return the converted string
   * @throws std::invalid_argument if the input is not decodable (raised by
   *         UTF8::nextCodepoint)
   */
  static std::basic_string<char16_t> convertUTF8To16(
      const std::basic_string<char>& str);

  /**
   * Encode a string of UTF-32 code points as UTF-8
   *
   * @param str the code points to encode
   * @return the UTF-8 encoded string
   */
  static std::basic_string<char> convertUTF32To8(
      const std::basic_string<char32_t>& str);

  /**
   * Encode a string of 16 bit code units as UTF-8
   *
   * @param str the code units to encode
   * @return the UTF-8 encoded string
   */
  static std::basic_string<char> convertUTF16To8(
      const std::basic_string<char16_t>& str);

  /**
   * Count the code points (not bytes) in a UTF-8 encoded string
   *
   * @param str the UTF-8 encoded string
   * @return the number of code points that were decoded
   * @throws std::invalid_argument if the input is not decodable (raised by
   *         UTF8::nextCodepoint)
   */
  static size_t countUTF8CodePoints(const std::string& str);

protected:

Loading