You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
361 lines
12 KiB
361 lines
12 KiB
// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
|
|
// Licensed under the MIT License:
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
// THE SOFTWARE.
|
|
|
|
// This file contains parsers useful for character stream inputs, including parsers to parse
|
|
// common kinds of tokens like identifiers, numbers, and quoted strings.
|
|
|
|
#ifndef KJ_PARSE_CHAR_H_
|
|
#define KJ_PARSE_CHAR_H_
|
|
|
|
#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
|
|
#pragma GCC system_header
|
|
#endif
|
|
|
|
#include "common.h"
|
|
#include "../string.h"
|
|
#include <inttypes.h>
|
|
|
|
namespace kj {
|
|
namespace parse {
|
|
|
|
// =======================================================================================
|
|
// Exact char/string.
|
|
|
|
class ExactString_ {
|
|
public:
|
|
constexpr inline ExactString_(const char* str): str(str) {}
|
|
|
|
template <typename Input>
|
|
Maybe<Tuple<>> operator()(Input& input) const {
|
|
const char* ptr = str;
|
|
|
|
while (*ptr != '\0') {
|
|
if (input.atEnd() || input.current() != *ptr) return nullptr;
|
|
input.next();
|
|
++ptr;
|
|
}
|
|
|
|
return Tuple<>();
|
|
}
|
|
|
|
private:
|
|
const char* str;
|
|
};
|
|
|
|
constexpr inline ExactString_ exactString(const char* str) {
  // Returns a parser that matches exactly the given NUL-terminated string (producing no result).
  // The returned parser keeps the pointer rather than a copy, so `str` should normally be a
  // string literal or otherwise outlive the parser.

  return ExactString_{str};
}
|
|
|
|
template <char c>
|
|
constexpr ExactlyConst_<char, c> exactChar() {
|
|
// Returns a parser that matches exactly the character given by the template argument (returning
|
|
// no result).
|
|
return ExactlyConst_<char, c>();
|
|
}
|
|
|
|
// =======================================================================================
|
|
// Char ranges / sets
|
|
|
|
class CharGroup_ {
|
|
public:
|
|
constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
|
|
|
|
constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
|
|
return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )),
|
|
bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)),
|
|
bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
|
|
bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
|
|
}
|
|
|
|
constexpr inline CharGroup_ orAny(const char* chars) const {
|
|
return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
|
|
}
|
|
|
|
constexpr inline CharGroup_ orChar(unsigned char c) const {
|
|
return CharGroup_(bits[0] | bit(c),
|
|
bits[1] | bit(c - 64),
|
|
bits[2] | bit(c - 128),
|
|
bits[3] | bit(c - 256));
|
|
}
|
|
|
|
constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
|
|
return CharGroup_(bits[0] | other.bits[0],
|
|
bits[1] | other.bits[1],
|
|
bits[2] | other.bits[2],
|
|
bits[3] | other.bits[3]);
|
|
}
|
|
|
|
constexpr inline CharGroup_ invert() const {
|
|
return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
|
|
}
|
|
|
|
constexpr inline bool contains(unsigned char c) const {
|
|
return (bits[c / 64] & (1ll << (c % 64))) != 0;
|
|
}
|
|
|
|
template <typename Input>
|
|
Maybe<char> operator()(Input& input) const {
|
|
if (input.atEnd()) return nullptr;
|
|
unsigned char c = input.current();
|
|
if (contains(c)) {
|
|
input.next();
|
|
return c;
|
|
} else {
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
private:
|
|
typedef unsigned long long Bits64;
|
|
|
|
constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
|
|
Bits64 bits[4];
|
|
|
|
static constexpr inline Bits64 oneBits(int count) {
|
|
return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
|
|
}
|
|
static constexpr inline Bits64 bit(int index) {
|
|
return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
|
|
}
|
|
};
|
|
|
|
constexpr inline CharGroup_ charRange(char first, char last) {
  // Builds a parser accepting any single character between `first` and `last`, both ends
  // included, e.g. `charRange('a', 'z')` for the lower-case letters. The parser's result is the
  // character that was matched.
  //
  // The result can be widened with further calls. This one accepts any letter plus '_', '+',
  // '-', and '.':
  //
  //     charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
  //
  // Call `.invert()` on the result to accept exactly the characters NOT in the set.

  return CharGroup_{}.orRange(first, last);
}
|
|
|
|
#if _MSC_VER
|
|
#define anyOfChars(chars) CharGroup_().orAny(chars)
|
|
// TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
|
|
// building the compiler or schema parser. We don't know why this happens, but Harris found that
|
|
// this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
|
|
// Hopefully, MSVC will get fixed soon and we'll be able to remove this.
|
|
#else
|
|
constexpr inline CharGroup_ anyOfChars(const char* chars) {
|
|
// Returns a parser that accepts any of the characters in the given string (which should usually
|
|
// be a literal). The returned parser is of the same type as returned by `charRange()` -- see
|
|
// that function for more info.
|
|
|
|
return CharGroup_().orAny(chars);
|
|
}
|
|
#endif
|
|
|
|
// =======================================================================================
|
|
|
|
namespace _ { // private
|
|
|
|
struct ArrayToString {
|
|
inline String operator()(const Array<char>& arr) const {
|
|
return heapString(arr);
|
|
}
|
|
};
|
|
|
|
} // namespace _ (private)
|
|
|
|
template <typename SubParser>
|
|
constexpr inline auto charsToString(SubParser&& subParser)
|
|
-> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
|
|
// Wraps a parser that returns Array<char> such that it returns String instead.
|
|
return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
|
|
}
|
|
|
|
// =======================================================================================
|
|
// Basic character classes.
|
|
|
|
constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
|
|
constexpr auto digit = charRange('0', '9');
|
|
constexpr auto alphaNumeric = alpha.orGroup(digit);
|
|
constexpr auto nameStart = alpha.orChar('_');
|
|
constexpr auto nameChar = alphaNumeric.orChar('_');
|
|
constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
|
|
constexpr auto octDigit = charRange('0', '7');
|
|
constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
|
|
constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
|
|
|
|
constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
|
|
|
|
constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
|
|
// Like discard(whitespace) but avoids some memory allocation.
|
|
|
|
// =======================================================================================
|
|
// Identifiers
|
|
|
|
namespace _ { // private
|
|
|
|
struct IdentifierToString {
|
|
inline String operator()(char first, const Array<char>& rest) const {
|
|
String result = heapString(rest.size() + 1);
|
|
result[0] = first;
|
|
memcpy(result.begin() + 1, rest.begin(), rest.size());
|
|
return result;
|
|
}
|
|
};
|
|
|
|
} // namespace _ (private)
|
|
|
|
constexpr auto identifier = transform(
    sequence(nameStart, many(nameChar)),
    _::IdentifierToString());
// Parses an identifier (e.g. a C variable name): one `nameStart` character followed by zero or
// more `nameChar` characters, assembled into a String by _::IdentifierToString.
|
|
|
|
// =======================================================================================
|
|
// Integers
|
|
|
|
namespace _ { // private
|
|
|
|
inline char parseDigit(char c) {
  // Converts one digit character to its numeric value: '0'-'9' map to 0-9, and 'A'-'F' / 'a'-'f'
  // map to 10-15. The character is not validated; the caller is expected to have matched it
  // against a digit parser already.

  if (c >= 'a') return c - 'a' + 10;
  if (c >= 'A') return c - 'A' + 10;
  return c - '0';
}
|
|
|
|
template <uint base>
|
|
struct ParseInteger {
|
|
inline uint64_t operator()(const Array<char>& digits) const {
|
|
return operator()('0', digits);
|
|
}
|
|
uint64_t operator()(char first, const Array<char>& digits) const {
|
|
uint64_t result = parseDigit(first);
|
|
for (char digit: digits) {
|
|
result = result * base + parseDigit(digit);
|
|
}
|
|
return result;
|
|
}
|
|
};
|
|
|
|
|
|
} // namespace _ (private)
|
|
|
|
constexpr auto integer = sequence(
    oneOf(
        transform(
            sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)),
            _::ParseInteger<16>()),
        transform(
            sequence(exactChar<'0'>(), many(octDigit)),
            _::ParseInteger<8>()),
        transform(
            sequence(charRange('1', '9'), many(digit)),
            _::ParseInteger<10>())),
    notLookingAt(alpha.orAny("_.")));
// Parses an integer literal in one of three forms: hexadecimal ("0x" plus at least one hex
// digit), octal (a leading '0' plus any number of octal digits), or decimal (a non-zero digit plus
// any number of digits). The value is computed by _::ParseInteger as a uint64_t. The literal must
// not be immediately followed by a letter, '_', or '.'.
|
|
|
|
// =======================================================================================
|
|
// Numbers (i.e. floats)
|
|
|
|
namespace _ { // private
|
|
|
|
struct ParseFloat {
|
|
double operator()(const Array<char>& digits,
|
|
const Maybe<Array<char>>& fraction,
|
|
const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
|
|
};
|
|
|
|
} // namespace _ (private)
|
|
|
|
constexpr auto number = transform(
    sequence(
        oneOrMore(digit),
        optional(sequence(
            exactChar<'.'>(),
            many(digit))),
        optional(sequence(
            discard(anyOfChars("eE")),
            optional(anyOfChars("+-")),
            many(digit))),
        notLookingAt(alpha.orAny("_."))),
    _::ParseFloat());
// Parses a floating-point literal: one or more digits, an optional '.' with any number of
// fraction digits, and an optional exponent ('e' or 'E', an optional sign, then any number of
// digits). The literal must not be immediately followed by a letter, '_', or '.'. The value is
// computed by _::ParseFloat.
|
|
|
|
// =======================================================================================
|
|
// Quoted strings
|
|
|
|
namespace _ { // private
|
|
|
|
struct InterpretEscape {
  // Transform functor mapping the letter of a single-character escape (the character after the
  // backslash) to the character it denotes. Letters without a special meaning, such as a quote
  // or backslash, stand for themselves.

  char operator()(char c) const {
    if (c == 'a') return '\a';
    if (c == 'b') return '\b';
    if (c == 'f') return '\f';
    if (c == 'n') return '\n';
    if (c == 'r') return '\r';
    if (c == 't') return '\t';
    if (c == 'v') return '\v';
    return c;
  }
};
|
|
|
|
struct ParseHexEscape {
|
|
inline char operator()(char first, char second) const {
|
|
return (parseDigit(first) << 4) | parseDigit(second);
|
|
}
|
|
};
|
|
|
|
struct ParseHexByte {
|
|
inline byte operator()(char first, char second) const {
|
|
return (parseDigit(first) << 4) | parseDigit(second);
|
|
}
|
|
};
|
|
|
|
struct ParseOctEscape {
|
|
inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
|
|
char result = first - '0';
|
|
KJ_IF_MAYBE(digit1, second) {
|
|
result = (result << 3) | (*digit1 - '0');
|
|
KJ_IF_MAYBE(digit2, third) {
|
|
result = (result << 3) | (*digit2 - '0');
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
};
|
|
|
|
} // namespace _ (private)
|
|
|
|
constexpr auto escapeSequence = sequence(
    exactChar<'\\'>(),
    oneOf(
        transform(
            anyOfChars("abfnrtv'\"\\\?"),
            _::InterpretEscape()),
        transform(
            sequence(exactChar<'x'>(), hexDigit, hexDigit),
            _::ParseHexEscape()),
        transform(
            sequence(octDigit, optional(octDigit), optional(octDigit)),
            _::ParseOctEscape())));
// Parses a C-string-style escape sequence and returns the char it denotes. After the leading
// backslash, the accepted forms are: a single escape letter or punctuation character, 'x'
// followed by exactly two hex digits, or one to three octal digits.
|
|
|
|
constexpr auto doubleQuotedString = charsToString(
    sequence(
        exactChar<'"'>(),
        many(oneOf(
            anyOfChars("\\\n\"").invert(),
            escapeSequence)),
        exactChar<'"'>()));
// Parses a C-style string enclosed in double quotes and returns its contents as a String. Inside
// the quotes, each element is either an escape sequence or any character other than a backslash,
// a newline, or a double quote.
|
|
|
|
constexpr auto singleQuotedString = charsToString(
    sequence(
        exactChar<'\''>(),
        many(oneOf(
            anyOfChars("\\\n\'").invert(),
            escapeSequence)),
        exactChar<'\''>()));
// Parses a C-style string enclosed in single quotes and returns its contents as a String. Inside
// the quotes, each element is either an escape sequence or any character other than a backslash,
// a newline, or a single quote.
|
|
|
|
constexpr auto doubleQuotedHexBinary = sequence(
    exactChar<'0'>(),
    exactChar<'x'>(),
    exactChar<'"'>(),
    oneOrMore(transform(
        sequence(discardWhitespace, hexDigit, hexDigit),
        _::ParseHexByte())),
    discardWhitespace,
    exactChar<'"'>());
// Parses a double-quoted hex binary literal. Returns Array<byte>. The literal is `0x"` followed
// by one or more bytes, each written as two adjacent hex digits, and a closing double quote.
// Whitespace is allowed before each byte and before the closing quote, but not between the two
// digits of a byte.
|
|
|
|
} // namespace parse
|
|
} // namespace kj
|
|
|
|
#endif // KJ_PARSE_CHAR_H_
|
|
|