~jojo/Carth

3656b8bd137fe2aaa6abffea78d9a4cc5a68fd4b — JoJo 1 year, 1 month ago c37fee0
Begin work on lexer

I feel like adding a token(tree)izer for mainly two reasons.

1. Separation of concerns. Parsing code will be simpler if we can
   separate out the lexing part.

2. In preparation for macros. Macros should act on token trees, so
   creating an in-between step is pretty much needed.

Currently, Lex.hs and Lexd.hs work and feel pretty much done, but
they're not connected to the rest of the compiler. What remains is to
rewrite Parse.hs to act on TokenTree:s instead of parsing text from
scratch.
7 files changed, 263 insertions(+), 17 deletions(-)

M TODO.org
M app/Main.hs
M carth.cabal
A src/Lex.hs
A src/Lexd.hs
M src/Misc.hs
M src/Parse.hs
M TODO.org => TODO.org +2 -2
@@ 513,8 513,8 @@ Features and other stuff to do/implement in/around Carth.
  persistend data structures as well.

  - Priority queue
  - Binary tree
  - B-tree
  - Binary tree (2-3 tree better?)
  - B-tree (specifically 2-3 tree?)
  - Random number generator
  - bubble, insertion, selection sort
  - quicksort

M app/Main.hs => app/Main.hs +13 -0
@@ 4,10 4,12 @@ module Main (main) where

import System.Environment
import Control.Monad
import Prelude hiding (lex)

import Misc
import Pretty
import qualified Err
import qualified Lexd
import qualified Parsed
import qualified Checked
import Check


@@ 18,6 20,7 @@ import Monomorphize
import Optimize
import qualified Optimized as Ast
import qualified Parse
import qualified Lex
import EnvVars

main :: IO ()


@@ 52,6 55,9 @@ runFile cfg = do
frontend :: Config cfg => cfg -> FilePath -> IO Ast.Program
frontend cfg f = do
    let d = getDebug cfg
    verbose cfg ("   Lexing")
    cst <- lex f
    when d $ writeFile ".dbg.lexd" (show cst)
    verbose cfg ("   Parsing")
    ast <- parse f
    when d $ writeFile ".dbg.parsed" (pretty ast)


@@ 65,6 71,13 @@ frontend cfg f = do
    when d $ writeFile ".dbg.opt" (show opt)
    pure opt

lex :: FilePath -> IO [Lexd.TokenTree]
lex f = Lex.lex f >>= \case
    Left e -> putStrLn (formatParseErr e) >> abort f
    Right p -> pure p
  where
    formatParseErr e = let ss = lines e in (unlines ((head ss ++ " Error:") : tail ss))

parse :: FilePath -> IO Parsed.Program
parse f = Parse.parse f >>= \case
    Left e -> putStrLn (formatParseErr e) >> abort f

M carth.cabal => carth.cabal +2 -0
@@ 38,6 38,8 @@ library
      Monomorphize
      Optimize
      Optimized
      Lex
      Lexd
      Parse
      Parsed
      Pretty

A src/Lex.hs => src/Lex.hs +193 -0
@@ 0,0 1,193 @@
{-# LANGUAGE FlexibleContexts, LambdaCase, TupleSections, DataKinds #-}

module Lex where

import Control.Monad
import Control.Monad.Except
import Data.Char (isMark, isPunctuation, isSymbol)
import Data.Functor
import Data.Maybe
import Control.Applicative (liftA2)
import Text.Megaparsec hiding (parse, match)
import Text.Megaparsec.Char hiding (space, space1)
import qualified Text.Megaparsec.Char as Char
import qualified Text.Megaparsec.Char.Lexer as Lexer
import Data.Set (Set)
import qualified Data.Set as Set
import System.FilePath
import System.Directory

import Misc
import SrcPos
import Lexd
import Literate
import EnvVars


type Import = String

-- data Macro = Macro String deriving (Show)

data TopLevel = TImport Import -- | TMacro Macro
    | TTokenTree TokenTree


lex :: FilePath -> IO (Either String [TokenTree])
lex filepath = do
    modPaths <- modulePaths
    runExceptT (lexModules modPaths Set.empty [filepath])

lexModules :: [FilePath] -> Set String -> [String] -> ExceptT String IO [TokenTree]
lexModules modPaths visiteds = \case
    [] -> pure []
    f : nexts | Set.member f visiteds -> lexModules modPaths visiteds nexts
    f : nexts -> do
        s <- lift (readFile f)
            <&> \s' -> if takeExtension f == ".org" then untangleOrg s' else s'
        (imps, tts) <- liftEither $ parse' toplevels f s
        let ps = takeDirectory f : modPaths
        let resolve m = do
                let gs = [ p </> addExtension m e | p <- ps, e <- [".carth", ".org"] ]
                gs' <- filterM (lift . doesFileExist) gs
                case listToMaybe gs' of
                    Nothing ->
                        throwError
                            $ ("Error: No file for module " ++ m ++ " exists.\n")
                            ++ ("Searched paths: " ++ show ps)
                    Just g' -> pure g'
        impFs <- mapM resolve imps
        ttsNexts <- lexModules modPaths (Set.insert f visiteds) (impFs ++ nexts)
        pure (tts ++ ttsNexts)

toplevels :: Parser ([Import], [TokenTree])
toplevels = do
    space
    tops <- many toplevel
    eof
    pure $ foldr
        (\top (is, tts) -> case top of
            TImport i -> (i : is, tts)
            TTokenTree tt -> (is, tt : tts)
        )
        ([], [])
        tops

toplevel :: Parser TopLevel
toplevel = getSrcPos >>= \p -> parens
    (fmap TImport import' <|> fmap (TTokenTree . WithPos p . Parens) (many tokentree))
    where import' = keyword' "import" *> small

tokentree :: Parser TokenTree
tokentree = withPos tokentree'
  where
    tokentree' = choice
        [ fmap Small smallSpecial
        , fmap Big bigSpecial
        , fmap Keyword (try keyword)
        , fmap Small smallNormal
        , fmap Big bigNormal
        , fmap Lit lit
        , fmap Parens (parens (many tokentree))
        , fmap Brackets (brackets (many tokentree))
        , fmap Braces (braces (many tokentree))
        ]
    lit = try num <|> fmap Str strlit
    num = andSkipSpaceAfter ns_num
    ns_num = do
        neg <- option False (char '-' $> True)
        a <- eitherP (try (ns_nat <* notFollowedBy (char '.'))) Lexer.float
        pure $ either ((\n -> Int (if neg then -n else n)) . fromIntegral)
                      (\x -> F64 (if neg then -x else x))
                      a
    -- ns_int = option id (char '-' $> negate) >>= \f -> fmap (f . fromIntegral) ns_nat
    ns_nat :: Parser Word
    ns_nat = choice
        [string "0b" *> Lexer.binary, string "0x" *> Lexer.hexadecimal, Lexer.decimal]

strlit :: Parser String
strlit = andSkipSpaceAfter ns_strlit
    where ns_strlit = char '"' >> manyTill Lexer.charLiteral (char '"')

keyword :: Parser Keyword
keyword = andSkipSpaceAfter $ choice
    [ string ":" $> Kcolon
    , string "." $> Kdot
    , string "Fun" $> KFun
    , string "Box" $> KBox
    , string "define" $> Kdefine
    , string "define:" $> KdefineColon
    , string "extern" $> Kextern
    , string "forall" $> Kforall
    , string "fmatch" $> Kfmatch
    , string "match" $> Kmatch
    , string "if" $> Kif
    , string "fun" $> Kfun
    , string "let1" $> Klet1
    , string "let" $> Klet
    , string "letrec" $> Kletrec
    , string "data" $> Kdata
    , string "sizeof" $> Ksizeof
    , string "import" $> Kimport
    , string "case" $> Kcase
    , string "id@" $> KidAt
    , string "Id@" $> KIdAt
    ]

keyword' :: String -> Parser ()
keyword' x = andSkipSpaceAfter $ label ("keyword " ++ x) (string x) $> ()

small, smallSpecial, smallNormal :: Parser String
small = smallSpecial <|> smallNormal
smallSpecial = keyword' "id@" *> strlit
smallNormal = andSkipSpaceAfter $ liftA2 (:) smallStart identRest
  where
    smallStart = lowerChar <|> otherChar <|> try (oneOf "-+" <* notFollowedBy digitChar)

big, bigSpecial, bigNormal :: Parser String
big = bigSpecial <|> bigNormal
bigSpecial = keyword' "id@" *> strlit
bigNormal = andSkipSpaceAfter $ liftA2 (:) bigStart identRest
    where bigStart = upperChar <|> char ':'

identRest :: Parser String
identRest = many identLetter
    where identLetter = letterChar <|> otherChar <|> oneOf "-+:" <|> digitChar

otherChar :: Parser Char
otherChar = satisfy
    (\c -> and
        [ any ($ c) [isMark, isPunctuation, isSymbol]
        , not (elem c "()[]{}")
        , not (elem c "\"-+:")
        ]
    )

parens, ns_parens :: Parser a -> Parser a
parens = andSkipSpaceAfter . ns_parens
ns_parens = between (symbol "(") (string ")")

brackets, ns_brackets :: Parser a -> Parser a
brackets = andSkipSpaceAfter . ns_brackets
ns_brackets = between (symbol "[") (string "]")

braces, ns_braces :: Parser a -> Parser a
braces = andSkipSpaceAfter . ns_braces
ns_braces = between (symbol "{") (string "}")

andSkipSpaceAfter :: Parser a -> Parser a
andSkipSpaceAfter = Lexer.lexeme space

symbol :: String -> Parser String
symbol = Lexer.symbol space

-- | Spaces, line comments, and block comments
space :: Parser ()
space = Lexer.space Char.space1 (Lexer.skipLineComment ";") empty

withPos :: Parser a -> Parser (WithPos a)
withPos = liftA2 WithPos getSrcPos

getSrcPos :: Parser SrcPos
getSrcPos = fmap
    (\(SourcePos f l c) -> SrcPos f (fromIntegral (unPos l)) (fromIntegral (unPos c)))
    getSourcePos

A src/Lexd.hs => src/Lexd.hs +37 -0
@@ 0,0 1,37 @@
{-# LANGUAGE LambdaCase, TypeSynonymInstances, FlexibleInstances
           , MultiParamTypeClasses, KindSignatures, DataKinds
           , DeriveDataTypeable #-}

module Lexd where

import SrcPos

data Const
    = Int Int
    | F64 Double
    | Str String
    deriving (Show, Eq)

data Keyword
    = Kcolon | Kdot
    | Kforall | KFun | KBox
    | Kdefine | KdefineColon
    | Kimport | Kextern | Kdata
    | Kfmatch | Kmatch | Kcase
    | Kif | Kfun
    | Klet1 | Klet | Kletrec
    | Ksizeof
    | KidAt | KIdAt
    deriving (Show)

data TokenTree'
    = Lit Const
    | Small String
    | Big String
    | Keyword Keyword
    | Parens [TokenTree]
    | Brackets [TokenTree]
    | Braces [TokenTree]
    deriving (Show)

type TokenTree = WithPos TokenTree'

M src/Misc.hs => src/Misc.hs +9 -0
@@ 6,6 6,7 @@ import Data.List (intercalate)
import qualified Data.Map as Map
import Data.Map (Map)
import Data.Maybe
import Data.Bifunctor
import Control.Monad.Reader
import Control.Monad.Writer
import Control.Monad.State


@@ 17,9 18,14 @@ import Text.Megaparsec hiding (parse, match)
import Text.Megaparsec.Char hiding (space, space1)
import Data.Void


newtype TopologicalOrder a = Topo [a]
    deriving Show

type Parser = Parsec Void String
type Source = String


ice :: String -> a
ice = error . ("Internal Compiler Error: " ++)



@@ 86,6 92,9 @@ splitOn sep = fromMaybe [] . Mega.parseMaybe splitOn'
        a <- many anySingle
        pure $ (as ++) $ if not (null a) then [a] else []

parse' :: Parser a -> FilePath -> Source -> Either String a
parse' p name src = first errorBundlePretty (Mega.parse p name src)

partitionWith :: (a -> Either b c) -> [a] -> ([b], [c])
partitionWith f = foldr
    (\x (bs, cs) -> case f x of

M src/Parse.hs => src/Parse.hs +7 -15
@@ 12,7 12,6 @@ module Parse (Parser, Source, parse, parse', parseTokenTreeOrRest, toplevels) wh
import Control.Monad
import Data.Char (isMark, isPunctuation, isSymbol, isUpper)
import Data.Functor
import Data.Bifunctor
import Data.Maybe
import Control.Applicative (liftA2)
import qualified Text.Megaparsec as Mega


@@ 22,7 21,6 @@ import qualified Text.Megaparsec.Char as Char
import qualified Text.Megaparsec.Char.Lexer as Lexer
import Data.Set (Set)
import qualified Data.Set as Set
import Data.Void
import Data.List
import System.FilePath
import System.Directory


@@ 34,8 32,7 @@ import Parsed
import Literate
import EnvVars

type Parser = Parsec Void String
type Source = String

type Import = String




@@ 91,9 88,6 @@ parseModule filepath dir m visiteds nexts =
                    Left e -> pure (Left e)
                    Right r -> advance r

parse' :: Parser a -> FilePath -> Source -> Either String a
parse' p name src = first errorBundlePretty (Mega.parse p name src)

-- | For use in module TypeErr to get the length of the tokentree to draw a
--   squiggly line under it.
parseTokenTreeOrRest :: Source -> Either String String


@@ 265,6 259,10 @@ ns_num = do
                   a
    pure (Lit e)

int :: Parser Int
int = andSkipSpaceAfter $ option id (char '-' $> negate) >>= \f ->
    fmap (f . fromIntegral) ns_nat

ns_nat :: Parser Word
ns_nat =
    choice [string "0b" *> Lexer.binary, string "0x" *> Lexer.hexadecimal, Lexer.decimal]


@@ 307,8 305,8 @@ nonptype = choice
  where
    tprim = try $ andSkipSpaceAfter
        (choice
                [ string "Nat" *> (fmap TNat ns_word <|> pure TNatSize)
                , string "Int" *> (fmap TInt ns_word <|> pure TIntSize)
                [ string "Nat" *> (fmap TNat Lexer.decimal <|> pure TNatSize)
                , string "Int" *> (fmap TInt Lexer.decimal <|> pure TIntSize)
                , string "F16" $> TF16
                , string "F32" $> TF32
                , string "F64" $> TF64


@@ 353,12 351,6 @@ brackets = andSkipSpaceAfter . ns_brackets
ns_brackets :: Parser a -> Parser a
ns_brackets = between (symbol "[") (string "]")

int :: Num a => Parser a
int = andSkipSpaceAfter (Lexer.signed empty ns_word)

ns_word :: Num a => Parser a
ns_word = Lexer.decimal

big :: Parser (Id 'Big)
big = fmap Id (special <|> normal)
  where