~siborgium/Regex

02a78ef06aaac89ccb9039eff848247705a6683c — Sergey Smirnykh 2 years ago 69cf084 trunk
Escape sequences parsing
1 files changed, 51 insertions(+), 20 deletions(-)

M Src/yyparser.y
M Src/yyparser.y => Src/yyparser.y +51 -20
@@ 279,6 279,48 @@ static size_t try_parse_class(const char* str, size_t n, const char** out) {
    return 0;
}

// One row of the escape table: `from` is the character to match and `to` is
// the NUL-terminated string it maps to.
struct escape_seq_kv {
    char from;
    const char* to;
};

// Table of single-character escapes, searched linearly by
// try_parse_escape_seq(). Every `to` string holds exactly one control
// character.
//
// There is intentionally no row for 'e' (ESC): the `\e` escape is not part of
// ISO C (compilers offer it only as an extension), and the author's IDE did
// not recognise it.
// NOTE(review): "\x1b" would be a portable spelling if ESC is ever wanted.
struct escape_seq_kv escape_seq_map[] = {
    { .from = 't', .to = "\t" },  // horizontal tab
    { .from = 'n', .to = "\n" },  // line feed
    { .from = 'r', .to = "\r" },  // carriage return
    { .from = 'f', .to = "\f" },  // form feed
    { .from = 'a', .to = "\a" },  // alert (bell)
    { .from = 'b', .to = "\b" }   // backspace
};

static const char* try_parse_escape_seq(char in) {
    struct escape_seq_kv* cur = escape_seq_map;
    struct escape_seq_kv* end = escape_seq_map + sizeof(escape_seq_map) / sizeof(*escape_seq_map);
    for (; cur != end; ++cur) {
        if (in == cur->from) {
            return cur->to;
        }
    }
    return NULL;
}

// Decodes one multibyte character from the first `n` bytes of `str` into
// `*parsed` using mbrtoc32(), always starting from a zeroed conversion state.
//
// Returns whatever mbrtoc32() returned. The three special results
// (size_t)-1, (size_t)-2 and (size_t)-3 trip an assertion with the messages
// below. When assertions are compiled out (NDEBUG) they are NOT handled here
// and are returned to the caller unchanged.
static size_t try_parse_lit(const char* str, size_t n, Glyph* parsed) {
    // A zero-filled mbstate_t, so no state carries over between calls.
    mbstate_t conv_state;
    memset(&conv_state, 0, sizeof(conv_state));

    const size_t result = mbrtoc32(parsed, str, n, &conv_state);

    if (result == (size_t)-1) {
        assert(0 && "invalid input");
    }
    if (result == (size_t)-2) {
        assert(0 && "truncated input");
    }
    if (result == (size_t)-3) {
        assert(0 && "no surrogate pairs");
    }

    return result;
}

static int yylex(YYSTYPE* lvalp, struct location* locp, void* ctx, struct slice* src) {
    struct parser_context* pctx = ctx;
    const char** pos = &src->begin;


@@ 301,6 343,14 @@ static int yylex(YYSTYPE* lvalp, struct location* locp, void* ctx, struct slice*
                *pos += len;
                return TOKEN_CLASS;
            }
            const char* seq = try_parse_escape_seq(**pos);
            if (seq) {
                size_t seq_len = strlen(seq);
                size_t r = try_parse_lit(seq, seq_len, &lvalp->glyph);
                *pos += r;
                return TOKEN_LIT;
            }

            // roll back so that it can be re-parsed as a glyph
            --(*pos);
            break;


@@ 321,29 371,10 @@ static int yylex(YYSTYPE* lvalp, struct location* locp, void* ctx, struct slice*
    }

    // TODO: provide escape/control sequences like \w \t \n \f etc

    // this is a lit
    mbstate_t state;
    memset(&state, '\0', sizeof(state));

    Glyph glyph;

    size_t r = mbrtoc32(
        &glyph,
        *pos,
        src->end - *pos,
        &state
    );
    switch (r) {
        case (size_t)-1: assert(0 && "invalid input");
        case (size_t)-2: assert(0 && "truncated input");
        case (size_t)-3: assert(0 && "no surrogate pairs");
        default: break;
    }
    size_t r = try_parse_lit(*pos, src->end - *pos, &lvalp->glyph);
    *pos += r;
    locp->pos = *pos;

    lvalp->glyph = glyph;
    return TOKEN_LIT;
}