implement the first three convenience utility functions (7fcc76f0) · Commits · 郑智淋 / lammps

src/utils.cpp

0 → 100644

+439 −0

Original line number	Diff line number	Diff line
		/* ----------------------------------------------------------------------
		LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
		http://lammps.sandia.gov, Sandia National Laboratories
		Steve Plimpton, sjplimp@sandia.gov

		Copyright (2003) Sandia Corporation. Under the terms of Contract
		DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
		certain rights in this software. This software is distributed under
		the GNU General Public License.

		See the README file in the top-level LAMMPS directory.
		------------------------------------------------------------------------- */

		#include <cstring>
		#include "utils.h"
		#include "error.h"

		/*! \file utils.cpp */

		/*
		* Mini regex-module adapted from https://github.com/kokke/tiny-regex-c
		* which is in the public domain.
		*
		* Supports:
		* ---------
		* '.' Dot, matches any character
		* '^' Start anchor, matches beginning of string
		* '$' End anchor, matches end of string
		* '*' Asterisk, match zero or more (greedy)
		* '+' Plus, match one or more (greedy)
		* '?' Question, match zero or one (non-greedy)
		* '[abc]' Character class, match if one of {'a', 'b', 'c'}
		* '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z }
		* '\s' Whitespace, \t \f \r \n \v and spaces
		* '\S' Non-whitespace
		* '\w' Alphanumeric, [a-zA-Z0-9_]
		* '\W' Non-alphanumeric
		* '\d' Digits, [0-9]
		* '\D' Non-digits
		*
		* NOT supported:
		* '[^abc]' Inverted class
		* 'a|b' Branches
		* '(abc)+' Groups
		*/

		extern "C"
		{
		  /** Match text against a (simplified) regular expression
		   * (regexp will be compiled automatically).
		   *
		   * Both arguments are null-terminated C strings. The result is the
		   * offset of the match inside text, or -1 when nothing matches. */
		  static int re_match(const char *text, const char *pattern);
		}

		using namespace LAMMPS_NS;

		/** Test whether a string matches a (simplified) regular expression.
		 *
		 * This is meant as a safer and more specific replacement for the
		 * strncmp() and strstr() idioms used to look for substrings:
		 *  - prefix tests with strncmp() require counting the characters to
		 *    compare, which is error prone; "^pattern" needs no count.
		 *  - suffix tests with strstr() are not specific; 'pattern$' is.
		 *  - compound tests, e.g. "^rigid.*\/small.*" to match all small
		 *    body optimized rigid fixes, need only a single call.
		 *
		 * Taking std::string arguments makes it easy to build the pattern
		 * by concatenation, even from char * variables.
		 * Example: utils::strmatch(text, std::string("^") + charptr)
		 */
		bool utils::strmatch(std::string text, std::string pattern)
		{
		  // re_match() yields the match offset, or a negative value on failure
		  return re_match(text.c_str(), pattern.c_str()) >= 0;
		}

		/* utility function to avoid code repetition when parsing args
		 *
		 * Checks whether arg starts with one of the prefixes "c_", "f_" or
		 * "v_" whose letter is listed in mode. On a match, cfv_id is set to
		 * a copy of the text after the prefix, allocated with new[] (the
		 * caller is responsible for delete[]), and the matching category is
		 * returned. Otherwise cfv_id is NULL and utils::NONE is returned. */
		int utils::cfvarg(std::string mode, const char *arg, char *&cfv_id)
		{
		  int rv = utils::NONE;
		  cfv_id = NULL;

		  if (!arg) return rv;

		  // mode becomes the content of a character class, e.g. "^[cfv]_"
		  if (utils::strmatch(arg,std::string("^[") + mode + "]_")) {
		    if (*arg == 'c') rv = utils::COMPUTE;
		    else if (*arg == 'f') rv = utils::FIX;
		    else if (*arg == 'v') rv = utils::VARIABLE;
		    else return rv; // should not happen

		    // skip the two character prefix and copy the remaining ID
		    arg += 2;
		    const size_t n = strlen(arg)+1;
		    cfv_id = new char[n];
		    strcpy(cfv_id,arg);
		  }

		  return rv;
		}

		/* like fgets() but aborts with an error or EOF is encountered */
		void utils::sfgets(char* srcname, int srcline, char *s, int size,
		FILE fp, std::string filename, Error error)
		{
		char *rv = fgets(s,size,fp);
		if (rv == NULL) { // something went wrong
		std::string errmsg;

		if (feof(fp)) {
		errmsg = "Unexpected end of file while reading file '";
		} else if (ferror(fp)) {
		errmsg = "Unexpected error while reading file '";
		} else {
		errmsg = "Unexpected short read while reading file '";
		}
		errmsg += filename + "'";

		if (error) error->one(srcname,srcline,errmsg.c_str());
		if (s) *s = '\0'; // truncate string to empty in case error is NULL
		}
		return;
		}

		/* ------------------------------------------------------------------ */

		extern "C" {
		/* Typedef'd pointer to get abstract datatype.
		 * A re_t points to the first element of an array of regex_t
		 * nodes; a null re_t is treated as "no pattern" by re_matchp(). */
		typedef struct regex_t *re_t;

		/* Compile regex string pattern to a regex_t-array.
		 * pattern is a null-terminated string. */
		static re_t re_compile(const char *pattern);


		/* Find matches of the compiled pattern inside text.
		 * Returns the offset of the match or -1 if there is none. */
		static int re_matchp(const char *text, re_t pattern);


		/* Definitions: */

		#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */
		#define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer. */


		/* Node types of a compiled pattern. UNUSED must stay first (value 0):
		 * it is the end-of-pattern sentinel. */
		enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS,
		       CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT,
		       ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE /*, BRANCH */ };

		/* One node of a compiled pattern. Which union member is meaningful
		 * depends on type: ch for CHAR nodes, ccl for (inverted) character
		 * classes. */
		typedef struct regex_t {
		  unsigned char type;    /* CHAR, STAR, etc. */
		  union {
		    unsigned char  ch;   /* the character itself */
		    unsigned char *ccl;  /* OR a pointer to characters in class */
		  };
		} regex_t;

		/* Private function declarations: */
		static int matchpattern(regex_t pattern, const char text);
		static int matchcharclass(char c, const char *str);
		static int matchstar(regex_t p, regex_t pattern, const char text);
		static int matchplus(regex_t p, regex_t pattern, const char text);
		static int matchone(regex_t p, char c);
		static int matchdigit(char c);
		static int matchalpha(char c);
		static int matchwhitespace(char c);
		static int matchmetachar(char c, const char *str);
		static int matchrange(char c, const char *str);
		static int ismetachar(char c);

		/* Semi-public functions: */

		/* Compile pattern and match it against text in one step.
		 * Returns the offset of the match in text or -1 if there is none. */
		int re_match(const char *text, const char *pattern)
		{
		  return re_matchp(text, re_compile(pattern));
		}

		/* Match an already compiled pattern against text.
		 * Returns the offset at which the pattern matched, or -1 when there
		 * is no match or no pattern. */
		int re_matchp(const char *text, re_t pattern)
		{
		  /* no compiled pattern available: nothing can match */
		  if (pattern == 0) return -1;

		  /* anchored pattern: only a match at the start of text counts */
		  if (pattern[0].type == BEGIN)
		    return matchpattern(&pattern[1], text) ? 0 : -1;

		  /* unanchored pattern: try every start position in turn, including
		     the position of the terminating null character */
		  for (int offset = 0; ; ++offset, ++text) {
		    if (matchpattern(pattern, text)) {
		      /* a match that starts at the end of text is reported as failure */
		      return (text[0] == '\0') ? -1 : offset;
		    }
		    if (text[0] == '\0') break;
		  }
		  return -1;
		}

		/* Compile a pattern string into an array of regex_t nodes.
		 *
		 * The result lives in function-local static storage, so every call
		 * overwrites the result of the previous one. Returns 0 for patterns
		 * that cannot be compiled: a character class that overflows the
		 * class buffer, a character class without closing ']', or a
		 * backslash that is not followed by another character. Patterns with
		 * more than MAX_REGEXP_OBJECTS-1 symbols are silently truncated. */
		re_t re_compile(const char *pattern)
		{
		  /* The sizes of the two static arrays below substantiates the static RAM usage of this module.
		     MAX_REGEXP_OBJECTS is the max number of symbols in the expression.
		     MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */
		  static regex_t re_compiled[MAX_REGEXP_OBJECTS];
		  static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN];
		  /* Start at 1: ccl_buf[0] is never written and thus stays zero, which
		     matchcharclass() relies on when it inspects str[-1]. */
		  int ccl_bufidx = 1;

		  char c;     /* current char in pattern */
		  int i = 0;  /* index into pattern */
		  int j = 0;  /* index into re_compiled */

		  while (pattern[i] != '\0' && (j+1 < MAX_REGEXP_OBJECTS)) {
		    c = pattern[i];

		    switch (c) {
		      /* Meta-characters: */
		      case '^': { re_compiled[j].type = BEGIN; } break;
		      case '$': { re_compiled[j].type = END; } break;
		      case '.': { re_compiled[j].type = DOT; } break;
		      case '*': { re_compiled[j].type = STAR; } break;
		      case '+': { re_compiled[j].type = PLUS; } break;
		      case '?': { re_compiled[j].type = QUESTIONMARK; } break;

		      /* Escaped character-classes (\s \w ...): */
		      case '\\': {
		        /* '\\' as last char in pattern -> invalid regular expression.
		           Reject it instead of leaving re_compiled[j] with whatever
		           an earlier compilation stored in that slot. */
		        if (pattern[i+1] == '\0') {
		          return 0;
		        }
		        /* Skip the escape-char '\\' */
		        i += 1;
		        /* ... and check the next */
		        switch (pattern[i]) {
		          /* Meta-character: */
		          case 'd': { re_compiled[j].type = DIGIT; } break;
		          case 'D': { re_compiled[j].type = NOT_DIGIT; } break;
		          case 'w': { re_compiled[j].type = ALPHA; } break;
		          case 'W': { re_compiled[j].type = NOT_ALPHA; } break;
		          case 's': { re_compiled[j].type = WHITESPACE; } break;
		          case 'S': { re_compiled[j].type = NOT_WHITESPACE; } break;

		          /* Escaped character, e.g. '.' or '$' */
		          default: {
		            re_compiled[j].type = CHAR;
		            re_compiled[j].ch = pattern[i];
		          } break;
		        }
		      } break;

		      /* Character class: */
		      case '[': {
		        /* Remember where the char-buffer starts. */
		        int buf_begin = ccl_bufidx;

		        /* Look-ahead to determine if negated */
		        if (pattern[i+1] == '^') {
		          re_compiled[j].type = INV_CHAR_CLASS;
		          i += 1; /* Increment i to avoid including '^' in the char-buffer */
		        } else {
		          re_compiled[j].type = CHAR_CLASS;
		        }

		        /* Copy characters inside [..] to buffer */
		        while ((pattern[++i] != ']') && (pattern[i] != '\0')) {
		          if (pattern[i] == '\\') {
		            /* an escape sequence needs room for two characters */
		            if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) {
		              return 0;
		            }
		            /* '\\' as last char: stop here, otherwise the loop
		               would step over the terminating null character */
		            if (pattern[i+1] == '\0') {
		              return 0;
		            }
		            ccl_buf[ccl_bufidx++] = pattern[i++];
		          } else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) {
		            return 0;
		          }
		          ccl_buf[ccl_bufidx++] = pattern[i];
		        }
		        /* Missing ']': invalid pattern. Continuing would advance i
		           beyond the terminating null character of pattern. */
		        if (pattern[i] == '\0') {
		          return 0;
		        }
		        if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) {
		          /* Catches cases such as [00000000000000000000000000000000000000][ */
		          return 0;
		        }
		        /* Null-terminate string end */
		        ccl_buf[ccl_bufidx++] = 0;
		        re_compiled[j].ccl = &ccl_buf[buf_begin];
		      } break;

		      /* Other characters: */
		      default: {
		        re_compiled[j].type = CHAR;
		        re_compiled[j].ch = c;
		      } break;
		    }
		    i += 1;
		    j += 1;
		  }
		  /* 'UNUSED' is a sentinel used to indicate end-of-pattern */
		  re_compiled[j].type = UNUSED;

		  return (re_t) re_compiled;
		}


		/* Private functions: */
		/* Returns 1 if c is one of the characters '0' to '9', else 0. */
		static int matchdigit(char c)
		{
		  if (c < '0') return 0;
		  return (c <= '9') ? 1 : 0;
		}

		/* Returns 1 if c is a letter in a-z or A-Z, else 0. */
		static int matchalpha(char c)
		{
		  return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
		}

		/* Returns 1 if c is a blank, tab, newline, carriage return,
		 * form feed or vertical tab, else 0. */
		static int matchwhitespace(char c)
		{
		  return ((c == ' ') || (c == '\t') || (c == '\n')
		          || (c == '\r') || (c == '\f') || (c == '\v'));
		}

		/* Returns 1 if c is an underscore, a letter or a digit, else 0.
		 * This is the test behind the '\w' and '\W' pattern elements. */
		static int matchalphanum(char c)
		{
		  return ((c == '_') || matchalpha(c) || matchdigit(c));
		}

		/* Returns 1 if str starts with a range of the form "x-y" and c lies
		 * between x and y (inclusive), else 0. A dash is never matched as a
		 * member of a range. */
		static int matchrange(char c, const char *str)
		{
		  /* the dash itself is handled as a literal elsewhere */
		  if (c == '-') return 0;

		  /* the lower bound must exist and must not be a dash */
		  if ((str[0] == '\0') || (str[0] == '-')) return 0;

		  /* a dash must follow, and an upper bound after that */
		  if (str[1] != '-') return 0;
		  if (str[2] == '\0') return 0;

		  return ((c >= str[0]) && (c <= str[2]));
		}

		/* Returns 1 if c is one of the letters that form a meta-character
		 * when preceded by a backslash (s, S, w, W, d, D), else 0. */
		static int ismetachar(char c)
		{
		  return ((c == 's') || (c == 'S')
		          || (c == 'w') || (c == 'W')
		          || (c == 'd') || (c == 'D'));
		}

		/* Match c against the escaped element whose letter is str[0].
		 * The letters d, w and s select the digit, alphanumeric and
		 * whitespace tests; their upper case forms select the negated test.
		 * Any other letter is compared literally. */
		static int matchmetachar(char c, const char *str)
		{
		  const char code = str[0];
		  int hit;

		  /* run the test shared by the lower and upper case letter */
		  switch (code) {
		    case 'd': case 'D': hit = matchdigit(c); break;
		    case 'w': case 'W': hit = matchalphanum(c); break;
		    case 's': case 'S': hit = matchwhitespace(c); break;
		    default: return (c == code);
		  }

		  /* upper case letters invert the result */
		  const bool negated = (code == 'D') || (code == 'W') || (code == 'S');
		  return negated ? !hit : hit;
		}

		/* Returns 1 if c is a member of the character class described by
		 * the null-terminated string str (the text between '[' and ']'),
		 * else 0. */
		static int matchcharclass(char c, const char *str)
		{
		  do {
		    if (matchrange(c, str)) {
		      return 1;
		    } else if (str[0] == '\\') {
		      /* Escape-char: increment str-ptr and match on next char */
		      str += 1;
		      if (matchmetachar(c, str)) {
		        return 1;
		      } else if ((c == str[0]) && !ismetachar(c)) {
		        return 1;
		      }
		    } else if (c == str[0]) {
		      if (c == '-') {
		        /* A dash is only a literal as first or last character of
		           the class. The look-behind str[-1] requires that a zero
		           byte precedes the class string in memory. */
		        return ((str[-1] == '\0') || (str[1] == '\0'));
		      } else {
		        return 1;
		      }
		    }
		  }
		  while (*str++ != '\0');

		  return 0;
		}

		/* Returns non-zero if the single character c satisfies the pattern
		 * node p, zero otherwise. Node types without a dedicated test are
		 * compared against the literal character stored in p.ch. */
		static int matchone(regex_t p, char c)
		{
		  switch (p.type) {
		    case DOT: return 1;
		    case CHAR_CLASS: return matchcharclass(c, (const char *)p.ccl);
		    case INV_CHAR_CLASS: return !matchcharclass(c, (const char *)p.ccl);
		    case DIGIT: return matchdigit(c);
		    case NOT_DIGIT: return !matchdigit(c);
		    case ALPHA: return matchalphanum(c);
		    case NOT_ALPHA: return !matchalphanum(c);
		    case WHITESPACE: return matchwhitespace(c);
		    case NOT_WHITESPACE: return !matchwhitespace(c);
		    /* p.ch is unsigned: convert c as well, otherwise characters with
		       the high bit set never compare equal where char is signed */
		    default: return (p.ch == (unsigned char) c);
		  }
		}

		static int matchstar(regex_t p, regex_t pattern, const char text)
		{
		do {
		if (matchpattern(pattern, text))
		return 1;
		}
		while ((text[0] != '\0') && matchone(p, *text++));

		return 0;
		}

		static int matchplus(regex_t p, regex_t pattern, const char text)
		{
		while ((text[0] != '\0') && matchone(p, *text++)) {
		if (matchpattern(pattern, text))
		return 1;
		}
		return 0;
		}

		static int matchquestion(regex_t p, regex_t pattern, const char text)
		{
		if (p.type == UNUSED)
		return 1;
		if (matchpattern(pattern, text))
		return 1;
		if (text && matchone(p, text++))
		return matchpattern(pattern, text);
		return 0;
		}

		/* Iterative matching */
		static int matchpattern(regex_t pattern, const char text)
		{
		do {
		if ((pattern[0].type == UNUSED) \|\| (pattern[1].type == QUESTIONMARK)) {
		return matchquestion(pattern[0], &pattern[2], text);
		} else if (pattern[1].type == STAR) {
		return matchstar(pattern[0], &pattern[2], text);
		} else if (pattern[1].type == PLUS) {
		return matchplus(pattern[0], &pattern[2], text);
		} else if ((pattern[0].type == END) && pattern[1].type == UNUSED) {
		return (text[0] == '\0');
		}
		}
		while ((text[0] != '\0') && matchone(pattern++, text++));

		return 0;
		}
		}

src/utils.h

0 → 100644

+76 −0

Original line number	Diff line number	Diff line
		/* -*- c++ -*- ----------------------------------------------------------
		LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
		http://lammps.sandia.gov, Sandia National Laboratories
		Steve Plimpton, sjplimp@sandia.gov

		Copyright (2003) Sandia Corporation. Under the terms of Contract
		DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
		certain rights in this software. This software is distributed under
		the GNU General Public License.

		See the README file in the top-level LAMMPS directory.
		------------------------------------------------------------------------- */

		#ifndef LMP_UTILS_H
		#define LMP_UTILS_H

		/*! \file utils.h */

		#include <string>
		#include <cstdio>

		namespace LAMMPS_NS {

		  // forward declarations
		  class Error;

		  namespace utils {

		    /** \brief Match text against a simplified regex pattern
		     *
		     * \param text the text to be matched against the pattern
		     * \param pattern the search pattern, which may contain regexp markers
		     * \return true if the pattern matches, false if not
		     */
		    bool strmatch(std::string text, std::string pattern);

		    /** Categories of special arguments for cfvarg() function
		     *
		     * Enum starts from 100 to avoid conflicts with other local define flags
		     */
		    enum {NONE=100, ///< does not match any category
		          COMPUTE,  ///< processed a compute
		          FIX,      ///< processed a fix
		          VARIABLE  ///< processed a variable
		    };

		    /** \brief Convenience function to process 'c_', 'f_', and 'v_' arguments
		     *
		     * \param mode types to search for. 1-3 char string from 'c', 'f', or 'v'
		     * \param arg argument string to test against the prefixes
		     * \param cfv_id name or ID of the compute, fix, or variable. Set to
		     *        a copy allocated with new[] on a match (to be released by
		     *        the caller with delete[]), or to NULL otherwise
		     * \return utils::COMPUTE, utils::FIX, utils::VARIABLE or utils::NONE
		     */
		    int cfvarg(std::string mode, const char *arg, char *&cfv_id);

		    /** \brief safe wrapper around fgets() which aborts on errors
		     * or EOF and prints a suitable error message to help debugging
		     *
		     * \param srcname name of the calling source file (from FLERR macro)
		     * \param srcline line in the calling source file (from FLERR macro)
		     * \param s buffer for storing the result of fgets()
		     * \param size size of buffer s (max number of bytes read by fgets())
		     * \param fp file pointer used by fgets()
		     * \param filename file name associated with fp (for error message)
		     * \param error pointer to Error class instance (for abort)
		     */
		    void sfgets(char* srcname, int srcline, char *s, int size,
		                FILE *fp, std::string filename, Error *error);
		  }
		}

		#endif

		/* ERROR/WARNING messages:

		*/

Admin message