-- -- file: Parser.lhs -- -- created: 27/11/96 by Andrew Rock. -- -- purpose: This defines the Parser function type, some basic parsers and the -- parser combinators to build new parsers. -- -- This version is based on Parser.hs, but can not cope with -- ambiguous grammars, however it can report errors. -- > module Parser (Line, Col, Pos, Msg, Could(Fail, Error, OK), > Analyser, Lexer, Parser, Tag, Lexeme, > preLex, succeedA, epsilonA, failA, errorA, > satisfyL, literalL, tagP, literalP, > ( <|> ), ( <*> ), ( *> ), ( <* ), ( @> ), ( #> ), > some, many, optional, total, nofail, nofail', ( %> ), > ( <**> ), ( <++> ), ( *%> ), soft, tagFilter, > spaceL, tabL, vertabL, formfeedL, newlineL, whitespaceL, > dropWhite, manyUntil, someUntil, alsoSat, alsoNotSat, > dataSatisfies, cardinalL, tokenL, listL, errMsg, cons > ) where > import Char > infixr 6 <*>, *>, <*, <**>, <++> > infixr 5 @>, #>, %>, *%> > infixr 4 <|> ===================== Positions in a source ===================== To report error the position of a character or token in a source is required. > type Line = Int -- counting lines from 0 > type Col = Int -- counting columms from 0 > type Pos = (Line, Col) -- position in a source N.B. A negative line value indicates "don't know where." ========= Analysers ========= An Analyser is a higher-level abstraction of both lexers and parsers. An analyser could succeed, fail or generate an error. This type wraps around any other type to indicate success, failure or an immediately identifiable error. Failure or error values return a diagnostic message and a position in the source. > type Msg = String > data Could a = Fail Pos Msg | Error Pos Msg | OK a > deriving Show An Analyser is a functions that tries to accept a list of inputs of type "a" with their positions, and return a value constructed from consumed inputs of type "b", a nominal position to go with that value and any unconsumed inputs. Alternately it could fail or generate an error. > type Analyser a b = [(a,Pos)] -> Could (b, [(a,Pos)]) ================ Preparing to lex ================ Before lexing can take place, the locations must be added. > preLex :: [Char] -> [(Char,Pos)] > preLex = pl (0,0) > where > pl _ [] = [] > pl (line,column) ('\n':cs) > = ('\n', (line, column)) : pl (line + 1, 0) cs > pl (line,column) ('\t':cs) > = ('\t', (line,column)) : pl (line, (column `div` 8 + 1) * 8) cs > pl (line,column) (c:cs) > = (c, (line,column)) : pl (line, column+1) cs ====== Lexing ====== Lexing is the process of breaking the input stream of characters up into a stream of lexemes (tokens). > type Lexeme = String Each lexeme will have been identified as belonging to one of an expected set of classes of lexemes. This information will be passed from a lexer to a parser by use of a Tag. > type Tag = String The input to a lexer function is a list of characters and their positions. The output from a lexer is a list of lexemes with their tags and positions and the list of unconsumed characters and their positions. The output could also be an error or failure msg. > type Lexer = Analyser Char [((Tag,Lexeme),Pos)] ======= Parsing ======= Parsing is the process of transforming the stream of Lexemes into a parse tree. The input to a parser is the list of lexemes with their tags and positions. The output from a parser is the parse tree and the list of unconsumed lexemes, or failure or error. > type Parser a = Analyser (Tag,Lexeme) a In the above definition, "a" is the type of the parse tree. ==================== Elementary Analysers ==================== These functions may be used as either lexers or parsers This is the simplest Analyser. It succeeds with a predetermined value v and does not consume any input. > succeedA :: b -> Analyser a b > succeedA v xs = OK (v, xs) This is a trivial case of succeedA. > epsilonA :: Analyser a () > epsilonA = succeedA () This always fails with a diagnostic message and the position of the next lexeme returned. > failA :: Msg -> Analyser a b > failA msg [] = Fail (-1,-1) msg > failA msg ((_,pos):_) = Fail pos msg This always returns an error with a diagnostic message and the position of the next lexeme. > errorA :: Msg -> Analyser a b > errorA msg [] = Error (-1,-1) msg > errorA msg ((_,pos):_) = Error pos msg ================= Elementary Lexers ================= This succeeds if the first input character passes test p. On success the Lexeme returned is the string containing just that character and the Tag returned is wantTag. On failure the Msg "wantTag expected" is returned. > satisfyL :: (Char -> Bool) -> Tag -> Lexer > satisfyL p wantTag [] = failA (wantTag ++ " expected") [] > satisfyL p wantTag ((c,pos):cps) > | p c = succeedA [((wantTag, [c]), pos)] cps > | otherwise = failA (wantTag ++ " expected") ((c,pos):cps) This succeeds if the first input is wantChar. On success the Lexeme returned is [wantChar] with the tag "'wantChar'" On failure the Msg "'wantChar' expected" is returned. > literalL :: Char -> Lexer > literalL wantChar = satisfyL (== wantChar) ('\'' : wantChar : "'") ================== Elementary Parsers ================== This succeeds if the first lexeme has Tag wantTag. On failure, the Msg returned is "wantTag expected". > tagP :: Tag -> Parser (Tag,Lexeme,Pos) > tagP wantTag [] = failA (wantTag ++ " expected") [] > tagP wantTag (((tag,lexeme),pos):xs) > | wantTag == tag = succeedA (tag,lexeme,pos) xs > | otherwise = failA (wantTag ++ " expected") (((tag,lexeme),pos):xs) This succeeds if the first lexeme has Tag wantTag and Lexeme wantLex. On failure, the Msg returned is "wantTag "wantLex" expected". > literalP :: Tag -> Lexeme -> Parser (Tag,Lexeme,Pos) > literalP wantTag wantLex [] > = failA (wantTag ++ " \"" ++ wantLex ++ "\" expected") [] > literalP wantTag wantLex (((tag,lexeme),pos):xs) > | wantTag == tag && wantLex == lexeme > = succeedA (tag,lexeme,pos) xs > | otherwise > = failA (wantTag ++ "\"" ++ wantLex ++ "\" expected") > (((tag,lexeme),pos):xs) =============================== Elementary Analyser combinators =============================== These allow the composition of analysers, lexers and parsers. The alternation combinator. Returns the result of one analyser or another if the first one fails. The order of the analyser arguments is very significant. The first success or error will be returned. > ( <|> ) :: Analyser a b -> Analyser a b -> Analyser a b > ( <|> ) a1 a2 xs > = case (a1 xs) of > Fail pos msg -> a2 xs > Error pos msg -> Error pos msg > OK stuff -> OK stuff Sequence combinator. > ( <*> ) :: Analyser a b -> Analyser a c -> Analyser a (b,c) > ( <*> ) a1 a2 xs > = case (a1 xs) of > Fail pos1 msg1 -> Fail pos1 msg1 > Error pos1 msg1 -> Error pos1 msg1 > OK (v1,ys) -> case (a2 ys) of > Fail pos2 msg2 -> Fail pos2 msg2 > Error pos2 msg2 -> Error pos2 msg2 > OK (v2,zs) -> OK ((v1,v2),zs) =============== Value modifiers =============== These modify an Analyser by modifying the type of value it returns. Using modifier. Changes the type of value returned by an Analyser a by applying function f to it. > ( @> ) :: Analyser a b -> (b -> c) -> Analyser a c > ( @> ) a f xs > = case (a xs) of > Fail pos msg -> Fail pos msg > Error pos msg -> Error pos msg > OK (v,ys) -> OK (f v, ys) Replacement modifier. Replaces the value with v. > ( #> ) :: Analyser a b -> c -> Analyser a c > a #> v = a @> const v ========================= More Analyser combinators ========================= This definition of cons as an uncurried form of : is used below. > cons :: (a,[a]) -> [a] > cons = uncurry (:) {}+ combinator. > some :: Analyser a b -> Analyser a [b] > some a = a <*> many a @> cons {} combinator. > many :: Analyser a b -> Analyser a [b] > many a = some a <|> succeedA [] This is the same as some but stops consuming input when a second analyser would work. > someUntil :: Analyser a b -> Analyser a c -> Analyser a [b] > someUntil a1 a2 input > = case a2 input of > OK _ -> failA "unexpected text" input > Fail _ _ -> ((a1 <*> manyUntil a1 a2) @> cons) input > Error _ _ -> ((a1 <*> manyUntil a1 a2) @> cons) input This is the same as many but stops consuming input when a second analyser would work. > manyUntil :: Analyser a b -> Analyser a c -> Analyser a [b] > manyUntil a1 a2 = someUntil a1 a2 <|> succeedA [] [] combinator. > optional :: Analyser a b -> Analyser a [b] > optional a = a @> (\x -> [x]) <|> succeedA [] Same as <*>, but discards first value. > ( *> ) :: Analyser a b -> Analyser a c -> Analyser a c > a1 *> a2 = a1 <*> a2 @> snd Same as <*>, but discards second value. > ( <* ) :: Analyser a b -> Analyser a c -> Analyser a b > a1 <* a2 = a1 <*> a2 @> fst This permits Analyser a1 to succeed and consume the input only if a2 would also succeed. > alsoSat :: Analyser a b -> Analyser a c -> Analyser a b > alsoSat a1 a2 input > = case a2 input of > OK _ -> a1 input > Fail msg pos -> Fail msg pos > Error msg pos -> Error msg pos This permits Analyser a1 to succeed and consume input only if a2 would not succeed. > alsoNotSat :: Analyser a b -> Analyser a c -> Analyser a b > alsoNotSat a1 a2 [] > = Fail (-1,-1) "end of input" > alsoNotSat a1 a2 ((x,p):xps) > = case a2 ((x,p):xps) of > OK _ -> Fail p "unexpected text" > Fail pos msg -> a1 ((x,p):xps) > Error pos msg -> a1 ((x,p):xps) This permits Analyser a to succeed and consume input only if an auxillary test performed on the data returned by the analyser returns true. > dataSatisfies :: Analyser a b -> (b -> Bool) -> Analyser a b > dataSatisfies _ _ [] > = Fail (-1,-1) "end of input" > dataSatisfies a test ((x,p):xps) > = let result = a ((x,p):xps) in > case result of > OK (dat, _) -> if test dat then result > else Fail p "unexpected text" > Fail pos msg -> Fail pos msg > Error pos msg -> Error pos msg Forces an Analyser to fail if it can't consume all the inputs. > total :: Analyser a b -> Analyser a b > total a xs > = case (a xs) of > Error pos msg -> Error pos msg > Fail pos msg -> Fail pos msg > OK (v,[]) -> OK (v,[]) > OK (v,((x,pos):_)) -> Fail pos "unexpected text" Forces an Analyser to return an error instead of merely failing. > nofail :: Analyser a b -> Analyser a b > nofail a xs > = case (a xs) of > Error pos msg -> Error pos msg > Fail pos msg -> Error pos msg > OK stuff -> OK stuff This version forces an Analyser to return an error instead of merely failing and overrides the error message with msg'. > nofail' :: Msg -> Analyser a b -> Analyser a b > nofail' msg' a xs > = case (a xs) of > Error pos msg -> Error pos msg > Fail pos msg -> Error pos msg' > OK stuff -> OK stuff ============================== Special combinators for Lexers ============================== This overrides the Tag produced by a Lexer with newTag. > ( %> ) :: Lexer -> Tag -> Lexer > l %> newTag = l @> (\[((tag,lexeme),pos)] -> [((newTag,lexeme),pos)]) This is the hard sequencing combinator for Lexers. The tag returned is the space-separated catenation of the two tags, the Lexeme returned is the catenation of the two lexemes, and the Pos returned is the first Pos. > ( <**> ) :: Lexer -> Lexer -> Lexer > l1 <**> l2 > = (l1 <*> l2) @> f > where > f (x,[]) = x > f ([],x) = x > f ([((t,l),p)],[((t',l'),p')]) = [((t ++ " " ++ t', l ++ l'), p)] This is the soft sequencing combinator for Lexers. The combined Lexer returns the catenation of the lists of lexemes produced by exch Lexer. > ( <++> ) :: Lexer -> Lexer -> Lexer > l1 <++> l2 = l1 <*> l2 @> (uncurry (++)) This permits the use of the combinators some, many and optional (above) by hard catenating the Lexemes and overriding the Tag. The Pos returned is the first. > ( *%> ) :: Analyser Char [[((Tag,Lexeme),Pos)]] -> Tag -> Lexer > a *%> newTag > = (a @> f) %> newTag > where > f xs = [(("", concat (map (snd.fst.head) xs)), (snd.head.head) xs)] This permits the use of the combinators some, many and optional (above) by soft catenating the Lexemes. > soft :: Analyser Char [[((Tag,Lexeme),Pos)]] -> Lexer > soft = ( @> concat) This modifies a lexer by making it throw out lexemes with a specified Tag. > tagFilter :: Tag -> Lexer -> Lexer > tagFilter dropTag = ( @> (filter ((/= dropTag).fst.fst) ) ) ======================= Dealing with whitespace ======================= Lexers to recognise individual whitespace characters. > spaceL :: Lexer > spaceL = literalL ' ' > tabL :: Lexer > tabL = literalL '\t' > newlineL :: Lexer > newlineL = literalL '\n' > vertabL :: Lexer > vertabL = literalL '\v' > formfeedL :: Lexer > formfeedL = literalL '\f' Lexer to recognise any amount of whitespace. Tag returned is " ". > whitespaceL :: Lexer > whitespaceL = some (spaceL <|> tabL <|> newlineL <|> vertabL <|> formfeedL) > *%> " " Lexer modifier to drop any whitespace that has been recognised by witespaceL. > dropWhite :: Lexer -> Lexer > dropWhite = tagFilter " " ==================== Recognising a number ==================== > cardinalL :: Lexer > cardinalL = some (satisfyL isDigit "digit") *%> "cardinal" =================== Recognising a Token =================== This is a Lexer that recognises a token. > tokenL :: String -> Lexer > tokenL [] = succeedA [] > tokenL (c:cs) = literalL c <**> (tokenL cs) ======================================== Building a Lexer out of a list of Lexers ======================================== This is a common lexical structure for sources: source ::= {lexeme1 | lexeme2 | ... } This function builds a Lexer for source out of a list of Lexers: [lexeme1L, lexeme2L, ... ] > listL :: [Lexer] -> Lexer > listL = soft . many . (foldr (<|>) (failA "unexpected text")) =============== Error reporting =============== This generates an error message. > errMsg :: Pos -> Msg -> String -> String > errMsg (-1,_) msg source > = errMsg (sl , length (sourcelines !! sl) + 1) msg source > where > sourcelines = lines source > sl = length sourcelines -1 > errMsg _ msg [] > = "Error: " ++ msg ++ ".\n" > errMsg (line,col) msg source > = "Error on line " ++ show (line + 1) ++ ": " ++ msg ++ ".\n" ++ > displaylines ++ > take (col') (repeat ' ') ++ "^\n" ++ > take (col') (repeat ' ') ++ "|\n" > where > col' | col > 0 = col > | otherwise = 0 > sourcelines = lines source > displaylines > | line == 0 = head sourcelines ++ "\n" > | line == 1 = unlines (take 2 sourcelines) > | otherwise = (unlines . (take 3) . (drop (line-2))) sourcelines