~thon/thon

thon/lex.sml -rw-r--r-- 8.5 KiB
2c9c65d6Evan Bergeron Parse SUCC | LET | CASE | FN 3 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
structure Lex : sig
datatype Token = FUN | FN | NAT | COLON | LPAREN | RPAREN | NAME of string | INDENT | DEDENT | RETURN | ZERO | SUCC | LET | SARROW | EQ | DARROW | IF | THEN | ELSE | DATA | BAR | CASE | COMMA | NEWLINE | UNIT | STAR
val lexFile : string -> Token list
val lexFileNoPrintErrMsg : string -> Token list
val tokenToString : Token -> string
end  =
struct

datatype Token = FUN | FN | NAT | COLON | LPAREN | RPAREN | NAME of string | INDENT | DEDENT | RETURN | ZERO | SUCC | LET | SARROW | EQ | DARROW | IF | THEN | ELSE | DATA | BAR | CASE | COMMA | NEWLINE | UNIT | STAR

exception No
exception UnexpectedIndentLevel
exception UnexpectedToken of string
exception UnimplementTokenToString

fun println s = print (s  ^ "\n")

fun debugPrint s =
    if false then println s
    else ()

fun tokenToString FUN = "FUN"
  | tokenToString FN = "FN"
  | tokenToString NAT = "NAT"
  | tokenToString COLON = "COLON"
  | tokenToString LPAREN = "LPAREN"
  | tokenToString RPAREN = "RPAREN"
  | tokenToString (NAME name) = "NAME " ^ name
  | tokenToString INDENT = "INDENT"
  | tokenToString DEDENT = "DEDENT"
  | tokenToString RETURN = "RETURN"
  | tokenToString ZERO = "ZERO"
  | tokenToString SUCC = "SUCC"
  | tokenToString LET = "LET"
  | tokenToString SARROW = "SARROW"
  | tokenToString EQ = "EQ"
  | tokenToString DARROW = "DARROW"
  | tokenToString IF = "IF"
  | tokenToString THEN = "THEN"
  | tokenToString ELSE = "ELSE"
  | tokenToString DATA = "DATA"
  | tokenToString BAR = "BAR"
  | tokenToString CASE = "CASE"
  | tokenToString COMMA = "COMMA"
  | tokenToString NEWLINE = "NEWLINE"
  | tokenToString UNIT = "UNIT"
  | tokenToString STAR = "STAR"

fun lookaheadN s n =
    (* Can raise Size *)
    let val st = TextIO.getInstream s
        val (n, tail) = TextIO.StreamIO.inputN (st, n);
    in n
    end

(* Get last char of lookahead *)
fun lookaheadOnlyN s n =
    let val st = TextIO.getInstream s
        val (k, tail) = TextIO.StreamIO.inputN (st, n);
        val () = debugPrint k
        val () = debugPrint (Int.toString n)
        val chars = explode k
        val res = (List.nth (chars, (List.length chars) - 1))
    in if n > (List.length chars) then NONE else SOME
        (debugPrint (Int.toString n); debugPrint (Char.toString res); res) end

fun getName' s n =
    (debugPrint ("getName " ^ (Int.toString n));
     case lookaheadOnlyN s n of
         NONE => lookaheadN s (n-1)
       | SOME c =>
         (if not (Char.isAlphaNum (c)) then
              lookaheadN s (n-1)
          else
              getName' s (n+1)))

fun getName s = getName' s 1

fun eatAndGetNumSpaces' s n =
    (debugPrint ("eatAndGetNumSpaces' " ^ (Int.toString n));
     case TextIO.lookahead s of
         SOME #" " => (
          TextIO.input1 s;
          eatAndGetNumSpaces' s (n+1)
         )
       | _ => n
    )

fun eatAndGetNumSpaces s = eatAndGetNumSpaces' s 0

fun eatWhitespace stream =
    case TextIO.lookahead stream of
        NONE => ()
      (* If I need to spit out NEWLINE tokens, feed `out` down here *)
      | SOME #"\n" => ((*TextIO.input1 stream*)(); ())
      | SOME c => if (Char.isSpace c)then
                  (TextIO.input1 stream; eatWhitespace stream)
                  else ()

fun onKeyword kw s =
    (debugPrint ("onKeyword " ^ kw);
    let val prefixOk = kw = (lookaheadN s (String.size kw))
        val afterChar = lookaheadOnlyN s ((String.size kw)+1)
        val suffixOk = not (Char.isAlphaNum (Option.valOf afterChar))
    in
        prefixOk andalso suffixOk
    end)

fun eatWord w s = (
    TextIO.inputN (s, (String.size w));
    eatWhitespace s
)

(* TODO lets 0 be a keyword, which is jank *)
fun eatKeywordOrName (w, tok) s indentLevel out =
    if onKeyword w s then (
        eatWord w s;
        lexLines' s (tok::out) indentLevel
    ) else (
        let val name = getName s in
            eatWord name s;
            lexLines' s ((NAME name)::out) indentLevel
        end
    )

and getIndentDedentTokens s out indentLevel =
    let
        (* TODO assert last elt of out is NEWLINE here *)
        val () = debugPrint ("Indent level: " ^ (Int.toString (!indentLevel)))
        val numSpaces = eatAndGetNumSpaces s
        (* UNDONE 2 space indent *)
        val thisLineIndentLevel = numSpaces div 4;
        val () = debugPrint ("thisLineIndentLevel " ^ (Int.toString (thisLineIndentLevel)))
        val tok = if thisLineIndentLevel > (!indentLevel) then INDENT else DEDENT
        val numToks = abs (thisLineIndentLevel - (!indentLevel))
        val toks = List.tabulate (numToks, fn _ => tok);
    in
        toks
    end

and lexLines' s out indentLevel =
    (debugPrint "=======================";
     List.map (fn tok => debugPrint (tokenToString tok)) out;
    case lookaheadN s 1 of
        "" => out
      | " " => raise No
      | "\n" => (
     (case lookaheadOnlyN s 2 of
         SOME #"\n" => (TextIO.input1 s;
                        lexLines' s (NEWLINE::out) indentLevel)
        | _ =>
          (TextIO.input1 s; (* can't eatWord here - keep leading spaces *)
        let val toks = getIndentDedentTokens s out indentLevel
        in
            if List.length toks = 0 then
                (* No indent or dedent here *)
                lexLines' s (NEWLINE::out) indentLevel
            else
                (indentLevel := !indentLevel +
                                ((List.length toks) *
                                 (if DEDENT = List.nth (toks, 0) then ~1 else 1))
                ; lexLines' s (toks @ (NEWLINE::out)) indentLevel)
        end)
        ))
      | "=" =>
        if onKeyword "=>" s then (
            eatWord "=>" s;
            lexLines' s (DARROW::out) indentLevel
        ) else if onKeyword "=" s then (
            eatWord "=" s;
            lexLines' s (EQ::out) indentLevel
        ) else (
            raise UnexpectedToken("saw `=`, expected `=>` or `=`")
        )
      | "*" => (
            eatWord "*" s;
            lexLines' s (STAR::out) indentLevel
          )
      | "u" => eatKeywordOrName ("unit", UNIT) s indentLevel out
      | "i" => eatKeywordOrName ("if", IF) s indentLevel out
      | "t" => eatKeywordOrName ("then", THEN) s indentLevel out
      | "e" => eatKeywordOrName ("else", ELSE) s indentLevel out
      | "d" => eatKeywordOrName ("data", DATA) s indentLevel out
      | "c" => eatKeywordOrName ("case", CASE) s indentLevel out
      (* TODO can just directly eat single chars *)
      | "," => eatKeywordOrName (",", COMMA) s indentLevel out
      | "|" => eatKeywordOrName ("|", BAR) s indentLevel out
      | "-" => eatKeywordOrName ("->", SARROW) s indentLevel out
      | "z" => eatKeywordOrName ("z", ZERO) s indentLevel out
      | "s" => eatKeywordOrName ("s", SUCC) s indentLevel out
      | "l" => eatKeywordOrName ("let", LET) s indentLevel out
      | "f" =>
        if onKeyword "fun" s then (
            eatWord "fun" s;
            lexLines' s (FUN::out) indentLevel
        ) else if onKeyword "fn" s then (
            eatWord "fn" s;
            lexLines' s (FN::out) indentLevel
        ) else (
            let val name = getName s in
                eatWord name s;
                lexLines' s ((NAME name)::out) indentLevel
            end
        )
      | "n" => eatKeywordOrName ("nat", NAT) s indentLevel out
      | "r" => eatKeywordOrName ("return", RETURN) s indentLevel out
      | "(" => (
          eatWord "(" s;
          lexLines' s (LPAREN::out) indentLevel
      )
      | ")" => (
          eatWord ")" s;
          lexLines' s (RPAREN::out) indentLevel
      )
      | ":" => (
          if lookaheadN s 2 = ":\n" then
              (eatWord ":" s;
               lexLines' s out indentLevel)
          else
              (eatWord ":" s;
               lexLines' s (COLON::out) indentLevel)
      )
      | other =>
        if not (Char.isAlpha (String.sub (other, 0))) then
            raise UnexpectedToken("See identifier starting with " ^ other ^ ". Indentifiers must start with alphabetic characters.")
        else
            let val name = getName s in
            eatWord name s;
            lexLines' s ((NAME name)::out) indentLevel
            end
      )

fun lexLines s indentLevel =
    let val backwards = lexLines' s [] indentLevel in List.rev backwards end

fun lex s printErrMsg =
    let
        val indentLevel = ref 0;
        val forewards = lexLines s indentLevel
    in
        forewards
    end
    handle UnexpectedToken msg => (if printErrMsg then print ("Lexing error: " ^ msg ^ "\n") else ();
                                   raise (UnexpectedToken msg) )

fun lexFile filename = lex (TextIO.openIn filename) true

fun lexFileNoPrintErrMsg filename = lex (TextIO.openIn filename) false


end