~kf5jwc/imp-parser

ref: 4a8b0d0c4a9b3e235854c13031f981f55fc7ece5 imp-parser/imp_parser/lexer/__init__.py -rw-r--r-- 1.3 KiB View raw
4a8b0d0c — Kyle Jones More typing! 1 year, 7 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import sys
import re
from typing import List, Tuple, Type, Pattern
from enum import Enum
from . import exceptions
from .exceptions import NoMatchFound


def lex(characters, token_exprs) -> List[Tuple[str, Type[Enum]]]:
    """Tokenize *characters* against the (regex, tag) pairs in *token_exprs*.

    characters must support len() and positional regex matching (i.e. be a
    string-like, indexable object). Matches whose tag is None (typically
    whitespace/comments) are consumed but not emitted. Raises
    exceptions.NoMatchFound (via find_next_expr) when no pattern matches
    at the current position.
    """
    compiled = precompile_tokens(token_exprs)  # compile once, not per token
    tokens: List[Tuple[str, Type[Enum]]] = []
    end = len(characters)
    pos = 0

    while pos < end:
        (text, tag), pos = find_next_expr(compiled, characters, pos)
        # A None tag means "recognized but ignored" — skip it in the output.
        if tag is not None:
            tokens.append((text, tag))

    return tokens


def precompile_tokens(given_tokens) -> List[Tuple[Pattern[str], Type[Enum]]]:
    """Compile each (regex-string, tag) pair into a (Pattern, tag) pair.

    Compiling up front means the lexer's inner loop never pays the
    re.compile cost per token. Order is preserved: earlier patterns
    take precedence during matching.
    """
    # Idiomatic comprehension replaces the manual append loop.
    return [(re.compile(expr), tag) for expr, tag in given_tokens]


def find_next_expr(expressions, characters, position):
    """Match *characters* at *position* against each compiled pattern in order.

    Returns ((matched_text, tag), end_position) for the first pattern that
    matches; raises NoMatchFound if none do. First-match-wins, so pattern
    order in *expressions* determines precedence.
    """
    for compiled, tag in expressions:
        m = compiled.match(characters, position)
        if m is None:
            continue
        return ((m.group(0), tag), m.end(0))

    raise NoMatchFound(
        "Illegal character at position {}: {}".format(position, characters[position])
    )


__all__ = ["lex", "exceptions"]