Packrat parser with left recursion

Traditional bottom-up LALR(1) parsers like yacc are both complex to implement and limited in what they can parse. In contrast, top-down packrat parsers (good Wikipedia intro) are simple to write from scratch — you can write a parser in a few dozen lines of code without needing any tool like yacc — and they are just about as powerful.

One remaining issue with packrat parsers is that they don’t natively support left recursion, making it hard to translate yacc-style parsers. However a couple of years ago an interesting paper was published, Packrat Parsers Can Support Left Recursion (PDF) which shows how to modify the memoization to support left recursion.

This really works, but unfortunately the pseudocode used in the paper is obscure and it’s hard to find example code, so I implemented this packrat parser with support for left recursion in OCaml.

Compile with:

$ ocamlfind opt -package extlib -linkpkg \
    testparse.ml -o testparse

and run it like this:

$ cat input
( fun a -> a + 1 ) (a + b)
$ ./testparse < input

open Printf

type grammar = production list
and production = string * choice list   (* rule name -> e1 | e2 | ... *)
and choice = string * string * parsing list (* choice name, comment, sequence *)
and parsing =
  | Z of parsing                        (* e* *)
  | P of parsing                        (* e+ *)
  | Opt of parsing                      (* e? *)
(*  | Lookahead of parsing                (* &e *)
  | Not of parsing                      (* !e *) *)
  | NT of string                        (* nonterminal 'name' *)
  | T of string                         (* terminal *)
  | Chars of char * char                (* [a-z] *)
  | Empty                               (* epsilon *)

let grammar : grammar = [
  (*
    start -> expr
  *)
  "start", [
    "", "", [NT "expr"]
  ];

  (*
    expr ->
          | expr expr
          | "fun" patt "->" expr = "fundecl" ; function
          | "(" expr ")"
          | expr "+" expr
          | ident
          | int64
  *)
  "expr", [
    "funappl", "", [NT"expr"; NT"sp"; NT"expr"];
    "fundecl", "function declaration",
      [T"fun"; NT"sp"; NT"patt"; NT"osp"; T"->"; NT"sp"; NT"expr"];
    "paren", "", [T"("; NT"osp"; NT"expr"; NT"osp"; T")"];
    "addition", "", [NT"expr"; NT"osp"; T"+"; NT"osp"; NT"expr"];
    "ident", "", [NT"ident"];
    "int64", "", [NT"int64"];
  ];

  (*
    patt -> ident
  *)
  "patt", [
    "", "", [NT "ident"]
  ];

  (*
    int64 -> digit19 digit*
           | digit
    digit -> [0-9]
    digit19 -> [1-9]
  *)
  "int64", [
    (*"", "", [NT"digit19"; Z(NT"digit")]; XXX *)
    "", "", [NT"digit"];
  ];
  "digit", [
    "", "", [Chars ('0', '9')];
  ];
  "digit19", [
    "", "", [Chars ('1', '9')];
  ];

  (*
    ident -> alpha+
    alpha -> [a-z]
  *)
  "ident", [
    "", "", [P (NT"alpha")];
  ];
  "alpha", [
    "", "", [Chars ('a', 'z')];
  ];

  (*
    sp -> " "+
    osp -> " "*
  *)
  "sp", [
    "", "", [P (T" ")];
  ];
  "osp", [
    "", "", [Z (T" ")];
  ];
]

let initial_state = "start"

(* To support backtracking, need to read all input. *)
let input =
  let lines = ref [] in
  (try
     while true do
       lines := read_line () :: !lines
     done
   with
     End_of_file -> ()
  );
  let lines = List.rev !lines in
  String.concat " " lines

let len = String.length input

(* Parser.
 * Packrat parser with left recursion, see:
 * "Packrat Parsers Can Support Left Recursion"
 * Alessandro Warth, James R. Douglass, Todd Millstein
 *)
module StringSet = Set.Make (String)
exception Found of int

type lr = {
  mutable seed : ans;
  mutable rulename : string;
  mutable head : head option;
}
and memoentry = { mutable ans : ans_or_lr; mutable pos : int }
and ans_or_lr = Answer of ans | LR of lr
and ans = int
and head = {
  mutable hrule : string;
  mutable involved_set : StringSet.t;
  mutable eval_set : StringSet.t;
}

let lrstack = ref []
let memo = Hashtbl.create 13
let pos = ref 0
let heads = Hashtbl.create 13

(* Apply rule 'rulename' at position 'i' in the input.  Returns the new
 * position if the rule can be applied, else -1 if fails.
 *)
let rec apply_rule rulename i =
  match recall rulename i with
  | None ->
      let lr = { seed = -1; rulename = rulename; head = None } in
      lrstack := lr :: !lrstack;
      let m = { ans = LR lr; pos = i } in
      Hashtbl.add memo (rulename, i) m;
      let r = parse rulename i in
      lrstack := List.tl !lrstack;
      m.pos <- !pos;
      if lr.head <> None then (
        lr.seed <- r;
        lr_answer rulename i m
      ) else (
        m.ans <- Answer r;
        r
      )
  | Some m ->
      pos := m.pos;
      match m.ans with
      | LR lr -> setup_lr rulename lr; lr.seed
      | Answer r -> r

and setup_lr rulename lr =
  if lr.head = None then
    lr.head <- Some { hrule = rulename;
                      involved_set = StringSet.empty;
                      eval_set = StringSet.empty };
  let lr_head = Option.get lr.head in
  let rec loop = function
    | [] -> assert false
    | l::_ when l.head = Some lr_head -> ()
    | l::ls ->
        l.head <- Some lr_head;
        lr_head.involved_set <- StringSet.add l.rulename lr_head.involved_set;
        loop ls
  in
  loop !lrstack

and lr_answer rulename i m =
  let lr = match m.ans with
    | Answer _ -> assert false
    | LR lr -> lr in
  let h = match lr.head with
    | None -> assert false
    | Some h -> h in
  if h.hrule <> rulename then
    lr.seed
  else (
    m.ans <- Answer lr.seed;
    if lr.seed = -1 then -1
    else grow_lr rulename i m h
  )

and recall rulename i =
  let m = try Some (Hashtbl.find memo (rulename, i)) with Not_found -> None in
  let h = try Some (Hashtbl.find heads i) with Not_found -> None in
  match h with
  | None -> m
  | Some h ->
      if m = None && not (StringSet.mem rulename
                            (StringSet.add h.hrule h.involved_set)) then
        Some { ans = Answer (-1); pos = i }
      else (
        if StringSet.mem rulename h.eval_set then (
          h.eval_set <- StringSet.remove rulename h.eval_set;
          let r = parse rulename i in
          (* Original paper RECALL function seems to have a bug ... *)
          let m = Option.get m in
          m.ans <- Answer r;
          m.pos <- !pos
        );
        m
      )

and grow_lr rulename i m h =
  Hashtbl.replace heads i h; (* A *)
  let rec loop () =
    pos := i;
    h.eval_set <- h.involved_set; (* B *)
    let ans = parse rulename i in
    if ans = -1 || !pos <= m.pos then ()
    else (
      m.ans <- Answer ans;
      m.pos <- !pos;
      loop ()
    )
  in
  loop ();
  Hashtbl.remove heads i; (* C *)
  pos := m.pos;
  match m.ans with
  | Answer r -> r
  | LR _ -> assert false

and parse rulename i =
  printf "parse %s %d\n" rulename i;
  let choices =
    try List.assoc rulename grammar
    with Not_found ->
      eprintf "error in grammar: unknown rulename '%s'\n" rulename;
      exit 1 in
  (* Try each choice in turn until one matches. *)
  try
    List.iter (
      fun (name, comment, rs) ->
        printf "parse %s \"%s\" %d\n" rulename name i;
        let rec loop i = function
          | Empty :: rest ->
              loop i rest
          | NT n :: rest ->
              let i' = apply_rule n i in
              if i' >= i then loop i' rest else (-1)
          | T str :: rest ->
              let slen = String.length str in
              if i + slen > len then (-1)
              else (
                let sub = String.sub input i slen in
                if sub = str then loop (i+slen) rest
                else (-1)
              )
          | Chars (c1, c2) :: rest ->
              if i >= len then (-1)
              else if input.[i] >= c1 && input.[i] <= c2 then
                loop (i+1) rest
              else (-1)
          | Z subr :: rest ->
              let i' = greedy 0 None subr i in
              if i' >= i then loop i' rest else (-1)
          | P subr :: rest ->
              let i' = greedy 1 None subr i in
              if i' >= i then loop i' rest else (-1)
          | Opt subr :: rest ->
              let i' = greedy 0 (Some 1) subr i in
              if i' >= i then loop i' rest else (-1)
          | [] -> i
        and greedy min max subr i = (* implements e* e+ e? *)
          printf "greedy %d %s %d\n" min
            (match max with None -> "None" | Some n -> sprintf "Some %d" n)
            i;
          match max with
          | Some 0 -> i
          | _ ->
              if min > 0 then (
                (* we must match at least min or fail *)
                let i' = loop i [subr] in
                if i' >= i then greedy (min-1) max subr i'
                else (-1)
              ) else (
                (* try matching, doesn't matter if we fail *)
                let i' = loop i [subr] in
                if i' >= i then (
                  let max =
                    match max with None -> None | Some n -> Some (n-1) in
                  greedy 0 max subr i'
                )
                else i (* don't fail, return longest match *)
              )
        in
        let i' = loop i rs in
        if i' >= i then (
          printf "match %s \"%s\" [%d..%d]\n" rulename name i (i'-1);
          raise (Found i')
        )
    ) choices;
    (-1)
  with
    Found i -> i

let () =
  let i = apply_rule initial_state 0 in
  if i = -1 then (
    eprintf "parse error: parsing failed\n";
    exit 1
  )
  else if i < len then (
    eprintf "parse error: extra characters after end of input\n";
    exit 1
  )
  else
    printf "parsed OK\n"

6 Comments

Filed under Uncategorized

6 responses to “Packrat parser with left recursion

  1. Left-recursive PEGs are really alluring. I similarly had problems understanding the Warth et al. paper initially. I eventually got my head around it (and came up with what I hope is an easier description of the core of the algorithm) only to discover that it has problems in some cases. Not everyone agrees with this yet, but some people do – YMMV. I wrote it up here. The good news is that I think we can solve some cases relatively easily; but a general solution might be quite hard.

    • rich

      Thanks — the paper looks interesting although it’s a bit too late at night for me to have any intelligent comments on that now.

      I just thought I’d say that I quickly found PEGs to be close to useless without left recursion, so the Warth paper at least makes them useful (for me). I couldn’t work out how to express a simple OCaml-like language without using left recursion for expressions. But do you think I’m just too used to yacc and I should learn how to refactor grammars?

      • I agree entirely – no left-recursion makes using PEGs, as traditionally defined, pretty horrible. I was pretty disappointed when I discovered what I think are problems with the published PEG left-recursion algorithm, as it seemed so promising initially.

        What the least-worst parsing technology is probably depends on your use-case. For bog-standard parsing, I typically use an Earley parser which can parse any CFG; there are other algorithms (e.g. GLR) which have the same end effect. The problem with such algorithms is that they allow ambiguity; with grammars for single languages that isn’t generally a terrible problem in practice. If you start combining languages, though, ambiguity is a real pain. I had hoped that PEGs would be a good solution for this use case but, currently, I’m not sure that they are.

  2. Conrad Meyer

    Can you post the source for testparse.ml?

    • Conrad Meyer

      Oh, I misunderstood, nevermind. (I thought you had posted just the library code — I didn’t spot the example grammar in the middle of it.)

  3. Kevembuangga

    Thanks for the idea of coding directly the grammar in Ocaml.
    I gave it a try and I think I solved the mutual left recursion problem.
    If you want to have a look…
    http://www.kevembuangga.com/lrtt/

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s