~jojo/Carth

ref: 55fb4f948f1f3797078b584dc60b4f7dd68b37ed Carth/src/Lex.hs -rw-r--r-- 6.9 KiB
55fb4f94JoJo Check `cast` in Infer instead of Gen 4 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
{-# LANGUAGE DataKinds #-}

-- Note: Some parsers are greedy wrt consuming spaces and comments succeding the
--       item, while others are lazy. You'll have to look at the impl to be
--       sure.
--
--       If a parser has a variant with a "ns_" prefix, that variant does not
--       consume succeding space, while the unprefixed variant does.

module Lex (lex, toplevel, tokentree) where

import Control.Monad
import Control.Monad.Except
import Control.Monad.State
import Data.Char (isMark, isPunctuation, isSymbol)
import Data.Functor
import Data.Maybe
import Control.Applicative (liftA2)
import Text.Megaparsec hiding (parse, match)
import Text.Megaparsec.Char hiding (space, space1)
import qualified Text.Megaparsec.Char as Char
import qualified Text.Megaparsec.Char.Lexer as Lexer
import Data.Set (Set)
import qualified Data.Set as Set
import System.FilePath
import System.Directory
import Data.Void
import Prelude hiding (lex)

import Misc
import SrcPos
import Lexd
import Literate
import EnvVars


type Lexer = Parsec Void String

type Import = String

data TopLevel = TImport Import -- | TMacro Macro
    | TTokenTree TokenTree


lex :: FilePath -> ExceptT String IO [TokenTree]
lex filepath = do
    modPaths <- lift modulePaths
    filepath' <- lift $ makeAbsolute filepath
    evalStateT (lexModule modPaths filepath') Set.empty

-- NOTE: For the current implementation of macros where order of definition matters, it's
--       important that we visit imports and concatenate all token trees in the correct
--       order, which is DFS.
lexModule
    :: [FilePath] -> FilePath -> StateT (Set FilePath) (ExceptT String IO) [TokenTree]
lexModule modPaths f = get >>= \visiteds -> if Set.member f visiteds
    then pure []
    else do
        modify (Set.insert f)
        s <- liftIO $ readFile f <&> \s' ->
            if takeExtension f == ".org" then untangleOrg s' else s'
        (imps, tts) <- liftEither $ parse' toplevels f s
        let ps = takeDirectory f : modPaths
        let resolve m = do
                let gs = [ p </> addExtension m e | p <- ps, e <- [".carth", ".org"] ]
                gs' <- filterM (liftIO . doesFileExist) gs
                case listToMaybe gs' of
                    Nothing ->
                        throwError
                            $ ("Error: No file for module " ++ m ++ " exists.\n")
                            ++ ("Searched paths: " ++ show ps)
                    Just g' -> liftIO $ makeAbsolute g'
        impFs <- mapM resolve imps
        ttsImp <- fmap concat $ mapM (lexModule modPaths) impFs
        pure (ttsImp ++ tts)

toplevels :: Lexer ([Import], [TokenTree])
toplevels = do
    space
    tops <- many toplevel
    eof
    pure $ foldr
        (\top (is, tts) -> case top of
            TImport i -> (i : is, tts)
            TTokenTree tt -> (is, tt : tts)
        )
        ([], [])
        tops

toplevel :: Lexer TopLevel
toplevel = getSrcPos >>= \p -> parens
    (fmap TImport import' <|> fmap (TTokenTree . WithPos p . Parens) (many tokentree))
    where import' = keyword' "import" *> small

tokentree :: Lexer TokenTree
tokentree = do
    p <- getSrcPos
    tt <- tokentree'
    tt' <- option tt (ellipsis $> Ellipsis (WithPos p tt))
    pure (WithPos p tt')
  where
    tokentree' = choice
        [ fmap Small smallSpecial
        , fmap Big bigSpecial
        , fmap Keyword (try keyword)
        , fmap Small smallNormal
        , fmap Big bigNormal
        , fmap Lit lit
        , fmap Parens (parens (many tokentree))
        , fmap Brackets (brackets (many tokentree))
        , fmap Braces (braces (many tokentree))
        ]
    ellipsis = try (string "..." *> notFollowedBy identLetter *> space)
    lit = try num <|> fmap Str strlit
    num = andSkipSpaceAfter ns_num
    ns_num = do
        neg <- option False (char '-' $> True)
        a <- eitherP (try (ns_nat <* notFollowedBy (char '.'))) Lexer.float
        pure $ either ((\n -> Int (if neg then -n else n)) . fromIntegral)
                      (\x -> F64 (if neg then -x else x))
                      a
    ns_nat :: Lexer Word
    ns_nat = choice
        [string "0b" *> Lexer.binary, string "0x" *> Lexer.hexadecimal, Lexer.decimal]

strlit :: Lexer String
strlit = andSkipSpaceAfter ns_strlit
    where ns_strlit = char '"' >> manyTill Lexer.charLiteral (char '"')

keyword :: Lexer Keyword
keyword = andSkipSpaceAfter $ choice $ (++)
    (map
        (\p -> try (p <* notFollowedBy identLetter))
        [ string ":" $> Kcolon
        , string "." $> Kdot
        , string "Fun" $> KFun
        , string "Box" $> KBox
        , string "define" $> Kdefine
        , string "define:" $> KdefineColon
        , string "extern" $> Kextern
        , string "forall" $> Kforall
        , string "fmatch" $> Kfmatch
        , string "match" $> Kmatch
        , string "if" $> Kif
        , string "fun" $> Kfun
        , string "let1" $> Klet1
        , string "let" $> Klet
        , string "letrec" $> Kletrec
        , string "data" $> Kdata
        , string "sizeof" $> Ksizeof
        , string "import" $> Kimport
        , string "case" $> Kcase
        , string "defmacro" $> Kdefmacro
        ]
    )
    [string "id@" $> KidAt, string "Id@" $> KIdAt]

keyword' :: String -> Lexer ()
keyword' x = andSkipSpaceAfter $ label ("keyword " ++ x) (string x) $> ()

small, smallSpecial, smallNormal :: Lexer String
small = smallSpecial <|> smallNormal
smallSpecial = keyword' "id@" *> strlit
smallNormal = andSkipSpaceAfter $ liftA2 (:) smallStart identRest
  where
    smallStart = lowerChar <|> otherChar <|> try
        (oneOf ("-+" :: String) <* notFollowedBy digitChar)

bigSpecial, bigNormal :: Lexer String
bigSpecial = keyword' "id@" *> strlit
bigNormal = andSkipSpaceAfter $ liftA2 (:) bigStart identRest
    where bigStart = upperChar <|> char ':'

identRest :: Lexer String
identRest = many identLetter

identLetter :: Lexer Char
identLetter = letterChar <|> otherChar <|> oneOf ("-+:" :: String) <|> digitChar

otherChar :: Lexer Char
otherChar = satisfy
    (\c -> and
        [ any ($ c) [isMark, isPunctuation, isSymbol]
        , not (elem c ("()[]{}" :: String))
        , not (elem c ("\"-+:•" :: String))
        ]
    )

parens, ns_parens :: Lexer a -> Lexer a
parens = andSkipSpaceAfter . ns_parens
ns_parens = between (symbol "(") (string ")")

brackets, ns_brackets :: Lexer a -> Lexer a
brackets = andSkipSpaceAfter . ns_brackets
ns_brackets = between (symbol "[") (string "]")

braces, ns_braces :: Lexer a -> Lexer a
braces = andSkipSpaceAfter . ns_braces
ns_braces = between (symbol "{") (string "}")

andSkipSpaceAfter :: Lexer a -> Lexer a
andSkipSpaceAfter = Lexer.lexeme space

symbol :: String -> Lexer String
symbol = Lexer.symbol space

-- | Spaces, line comments, and block comments
space :: Lexer ()
space = Lexer.space Char.space1 (Lexer.skipLineComment ";") empty

getSrcPos :: Lexer SrcPos
getSrcPos = fmap
    (\(SourcePos f l c) ->
        SrcPos f (fromIntegral (unPos l)) (fromIntegral (unPos c)) Nothing
    )
    getSourcePos