~siborgium/Regex

69cf084b269414c7f324a972cfe0171ec09d45fa — Sergey Smirnykh 2 years ago be44956
Classes parsing stub
1 files changed, 47 insertions(+), 21 deletions(-)

M Src/yyparser.y
M Src/yyparser.y => Src/yyparser.y +47 -21
@@ 15,10 15,11 @@
#include <slice.h>
#include <ast.h>
#include <parser_context.h>
#include <stdio.h>

struct location {
    Byte* pos;
    Byte* end;
    const Byte* pos;
    const Byte* end;
};

#define YYLLOC_DEFAULT(Cur, Rhs, N)                                          \


@@ 53,12 54,13 @@ struct location {
    struct set_piece set_elem;
    Glyph            glyph;
    char             repeat;
    const char*      class_;
}

%token          TOKEN_EOF
%token <glyph>  TOKEN_LIT
%token <repeat> TOKEN_REPEAT
%token <class>  TOKEN_CLASS
%token <class_> TOKEN_CLASS

%type  <node>     regex seq expr plain_expr repeatable_expr repeated_expr or atom
%type  <set>      set set_expr_list


@@ 240,29 242,44 @@ set_expr_list: set_expr
             }
             ;

class: '\\' class_
class: TOKEN_CLASS
     {
         fprintf(stderr, "Got class: %s\n", $1);
     }
     ;

class_: 'a'
      | 'b'
      | 'B'
      | 'e'
      | 'f'
      | 'v'
      | 'd'
      | 'D'
      | 's'
      | 'S'
      | 'w'
      | 'W'
      ;


%%

#include <stdio.h>

int yylex(YYSTYPE* lvalp, struct location* locp, void* ctx, struct slice* src) {
// Class names recognised directly after a backslash. try_parse_class walks
// this table front to back and reports the first entry that matches.
const char* known_classes[] = { "e", "v", "d", "D", "s", "S", "w", "W", "N" };

// Tries to match the beginning of `str` against the entries of known_classes,
// in table order.
//
//   str - bytes to inspect (at least `n` of them must be readable)
//   n   - number of bytes available at `str`; entries longer than this are skipped
//   out - receives the matching table entry; left untouched when nothing matches
//
// Returns the length of the first matching entry, or 0 if no entry matches.
static size_t try_parse_class(const char* str, size_t n, const char** out) {
    const size_t count = sizeof(known_classes) / sizeof(*known_classes);
    for (size_t i = 0; i < count; ++i) {
        const char*  name     = known_classes[i];
        const size_t name_len = strlen(name);
        if (name_len > n) {
            // not enough input left for this entry
            continue;
        }
        if (strncmp(name, str, name_len) != 0) {
            continue;
        }
        *out = name;
        return name_len;
    }
    return 0;
}

static int yylex(YYSTYPE* lvalp, struct location* locp, void* ctx, struct slice* src) {
    struct parser_context* pctx = ctx;
    const char** pos = &src->begin;



@@ 272,15 289,22 @@ int yylex(YYSTYPE* lvalp, struct location* locp, void* ctx, struct slice* src) {
        return YYEOF;
    }

    int escaped = 0;
    switch (**pos) {
        case '\\':
            escaped = 1;
        {
            ++(*pos);
            if (*pos == src->end) {
                return YYEOF;
            }
            size_t len = try_parse_class(*pos, src->end - *pos, &lvalp->class_);
            if (len) {
                *pos += len;
                return TOKEN_CLASS;
            }
            // rollback, so that we could re-parse it as glyph
            --(*pos);
            break;
        }
        case '|': ++(*pos); return '|';
        case '+':
        case '*':


@@ 296,6 320,8 @@ int yylex(YYSTYPE* lvalp, struct location* locp, void* ctx, struct slice* src) {
        default: break;
    }

    // TODO: provide escape/control sequences like \w \t \n \f etc

    // this is a lit
    mbstate_t state;
    memset(&state, '\0', sizeof(state));