Tag Archives: ocaml

More static analysis with CIL

Years ago I played around with CIL to analyze libvirt. More recently Dan used CIL to analyze libvirt’s locking code.

We didn’t get so far either time, but I’ve been taking a deeper look at CIL in an attempt to verify error handling in libguestfs.

Here is my partly working code so far.

(*
 * Analyse libguestfs APIs to find error overwriting.
 * Copyright (C) 2008-2013 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 * Author: Richard W.M. Jones <rjones@redhat.com>
 *)

open Unix
open Printf

open Cil

let debug = ref false

(* Set of ints. *)
module IntSet = Set.Make (struct type t = int let compare = compare end)

(* A module for storing any set (unordered list) of functions. *)
module FunctionSet = Set.Make (
  struct
    type t = varinfo
    let compare v1 v2 = compare v1.vid v2.vid
  end
)

(* Directed graph of functions.
 *
 * Function = a node in the graph
 * FunctionDigraph = the directed graph
 * FunctionPathChecker = path checker module using Dijkstra's algorithm
 *)
module Function =
struct
  type t = varinfo
  let compare f1 f2 = compare f1.vid f2.vid
  let hash f = Hashtbl.hash f.vid
  let equal f1 f2 = f1.vid = f2.vid
end
module FunctionDigraph = Graph.Imperative.Digraph.Concrete (Function)
module FunctionPathChecker = Graph.Path.Check (FunctionDigraph)

(* Module used to analyze the paths through each function. *)
module ErrorCounter =
struct
  let name = "ErrorCounter"
  let debug = debug

  (* Our current state is very simple, just the number of error
   * function calls did encountered up to this statement.
   *)
  type t = int

  let copy errcalls = errcalls

  (* Start data for each statement. *)
  let stmtStartData = Inthash.create 97

  let printable errcalls = sprintf "(errcalls=%d)" errcalls

  let pretty () t = Pretty.text (printable t)

  let computeFirstPredecessor stmt x = x (* XXX??? *)

  let combinePredecessors stmt ~old:old_t new_t =
    if old_t = new_t then None
    else Some new_t

  (* This will be initialized after we have calculated the set of all
   * functions which can call an error function, in main() below.
   *)
  let error_functions_set = ref FunctionSet.empty

  (* Handle a Cil.Instr. *)
  let doInstr instr _ =
    match instr with
    (* A call to an error function. *)
    | Call (_, Lval (Var callee, _), _, _)
        when FunctionSet.mem callee !error_functions_set ->
      Dataflow.Post (fun errcalls -> errcalls+1)

    | _ -> Dataflow.Default

  (* Handle a Cil.Stmt. *)
  let doStmt _ _ = Dataflow.SDefault

  (* Handle a Cil.Guard. *)
  let doGuard _ _ = Dataflow.GDefault

  (* Filter statements we've seen already to prevent loops. *)
  let filter_set = ref IntSet.empty
  let filterStmt { sid = sid } =
    if IntSet.mem sid !filter_set then false
    else (
      filter_set := IntSet.add sid !filter_set;
      true
    )

  (* Initialize the module before each function that we examine. *)
  let init stmts =
    filter_set := IntSet.empty;
    Inthash.clear stmtStartData;
    (* Add the initial statement(s) to the hash. *)
    List.iter (fun stmt -> Inthash.add stmtStartData stmt.sid 0) stmts
end

module ForwardsErrorCounter = Dataflow.ForwardsDataFlow (ErrorCounter)

(* The always useful filter + map function. *)
let rec filter_map f = function
  | [] -> []
  | x :: xs ->
      match f x with
      | Some y -> y :: filter_map f xs
      | None -> filter_map f xs

let rec main () =
  (* Read the list of input C files. *)
  let files =
    let chan = open_process_in "find src -name '*.i' | sort" in
    let files = input_chan chan in
    if close_process_in chan <> WEXITED 0 then
      failwith "failed to read input list of files";
    if files = [] then
      failwith "no input files; is the program running from the top directory? did you compile with make -C src CFLAGS=\"-save-temps\"?";
    files in

  (* Load and parse each input file. *)
  let files =
    List.map (
      fun filename ->
        printf "loading %s\n%!" filename;
        Frontc.parse filename ()
    ) files in

  (* Merge the files. *)
  printf "merging files\n%!";
  let sourcecode = Mergecil.merge files "libguestfs" in

  (* CFG analysis. *)
  printf "computing control flow\n%!";
  Cfg.computeFileCFG sourcecode;

  let functions =
    filter_map (function GFun (f, loc) -> Some (f, loc) | _ -> None)
      sourcecode.globals in

  (* Examine which functions directly call which other functions. *)
  printf "computing call graph\n%!";
  let call_graph = make_call_graph functions in
  (*
  FunctionDigraph.iter_edges (
    fun caller callee ->
      printf "%s calls %s\n" caller.vname callee.vname
  ) call_graph;
  *)

  (* The libguestfs error functions.  These are global function names,
   * but to be any use to us we have to look these up in the list of
   * all global functions (ie. 'functions') and turn them into the
   * corresponding varinfo structures.
   *)
  let error_function_names = [ "guestfs_error_errno";
                               "guestfs_perrorf" ] in

  let find_function name =
    try List.find (fun ({ svar = { vname = n }}, _) -> n = name) functions
    with Not_found -> failwith ("function '" ^ name ^ "' does not exist")
  in
  let error_function_names = List.map (
    fun f -> (fst (find_function f)).svar
  ) error_function_names in

  (* Get a list of functions that might (directly or indirectly) call
   * one of the error functions.
   *)
  let error_functions, non_error_functions =
    functions_which_call call_graph error_function_names functions in

  (*
  List.iter (
    fun f -> printf "%s can call an error function\n" f.vname
  ) error_functions;

  List.iter (
    fun f -> printf "%s can NOT call an error function\n" f.vname
  ) non_error_functions;
  *)

  (* Save the list of error functions in a global set for fast lookups. *)
  let set =
    List.fold_left (
      fun set elt -> FunctionSet.add elt set
    ) FunctionSet.empty error_functions in
  ErrorCounter.error_functions_set := set;

  (* Analyze each top-level function to ensure it calls an error
   * function exactly once on error paths, and never on normal return
   * paths.
   *)
  printf "analyzing correctness of error paths\n%!";
  List.iter compute_error_paths functions;

  ()

(* Make a directed graph of which functions directly call which other
 * functions.
 *)
and make_call_graph functions =
  let graph = FunctionDigraph.create () in

  List.iter (
    fun ({ svar = caller; sallstmts = sallstmts }, _) ->
      (* Evaluate which other functions 'caller' calls.  First pull
       * out every 'Call' instruction anywhere in the function.
       *)
      let insns =  List.concat (
        filter_map (
          function
          | { skind = Instr insns } -> Some insns
          | _ -> None
        ) sallstmts
      ) in
      let calls = List.filter (function Call _ -> true | _ -> false) insns in
      (* Then examine what function is being called at each place. *)
      let callees = filter_map (
        function
        | Call (_, Lval (Var callee, _), _, _) -> Some callee
        | _ -> None
      ) calls in

      List.iter (
        fun callee ->
          FunctionDigraph.add_edge graph caller callee
      ) callees
  ) functions;

  graph

(* [functions_which_call g endpoints functions] partitions the
 * [functions] list, returning those functions that call directly or
 * indirectly one of the functions in [endpoints], and a separate list
 * of functions which do not.  [g] is the direct call graph.
 *)
and functions_which_call g endpoints functions =
  let functions = List.map (fun ({ svar = svar }, _) -> svar) functions in

  let checker = FunctionPathChecker.create g in
  List.partition (
    fun f ->
      (* Does a path exist from f to any of the endpoints? *)
      List.exists (
        fun endpoint ->
          try FunctionPathChecker.check_path checker f endpoint
          with
          (* It appears safe to ignore this exception.  It seems to
           * mean that this function is in a part of the graph which
           * is completely disconnected from the other part of the graph
           * that contains the endpoint.
           *)
          | Invalid_argument "[ocamlgraph] iter_succ" -> false
      ) endpoints
  ) functions

and compute_error_paths ({ svar = svar } as f, loc) =
  (*ErrorCounter.debug := true;*)

  (* Find the initial statement in this function (assumes that the
   * function can only be entered in one place, which is normal for C
   * functions).
   *)
  let initial_stmts =
    match f.sbody.bstmts with
    | [] -> []
    | first::_ -> [first] in

  (* Initialize ErrorCounter. *)
  ErrorCounter.init initial_stmts;

  (* Compute the error counters along paths through the function. *)
  ForwardsErrorCounter.compute initial_stmts;

  (* Process all Return statements in this function. *)
  List.iter (
    fun stmt ->
      try
        let errcalls = Inthash.find ErrorCounter.stmtStartData stmt.sid in

        match stmt with
        (* return -1; *)
        | { skind = Return (Some i, loc) } when is_literal_minus_one i ->
          if errcalls = 0 then
            printf "%s:%d: %s: may return an error code without calling error/perrorf\n"
              loc.file loc.line svar.vname
          else if errcalls > 1 then
            printf "%s:%d: %s: may call error/perrorf %d times (more than once) along an error path\n"
          loc.file loc.line svar.vname errcalls

        (* return 0; *)
        | { skind = Return (Some i, loc) } when is_literal_zero i ->
          if errcalls >= 1 then
            printf "%s:%d: %s: may call error/perrorf along a non-error return path\n"
              loc.file loc.line svar.vname

        (* return; (void return) *)
        | { skind = Return (None, loc) } ->
          if errcalls >= 1 then
            printf "%s:%d: %s: may call error/perrorf and return void\n"
              loc.file loc.line svar.vname

        | _ -> ()

      with
        Not_found ->
          printf "%s:%d: %s: may contain unreachable code\n"
            loc.file loc.line svar.vname
  ) f.sallstmts

(* Some convenience CIL matching functions. *)
and is_literal_minus_one = function
  | Const (CInt64 (-1L, _, _)) -> true
  | _ -> false

and is_literal_zero = function
  | Const (CInt64 (0L, _, _)) -> true
  | _ -> false

(* Convenient routine to load the contents of a channel into a list of
 * strings.
 *)
and input_chan chan =
  let lines = ref [] in
  try while true; do lines := input_line chan :: !lines done; []
  with End_of_file -> List.rev !lines

and input_file filename =
  let chan = open_in filename in
  let r = input_chan chan in
  close_in chan;
  r

let () =
  try main ()
  with 
    exn ->
      prerr_endline (Printexc.to_string exn);
      Printexc.print_backtrace Pervasives.stderr;
      exit 1

8 Comments

Filed under Uncategorized

OCaml Users and Developers slides and videos

http://oud.ocaml.org/2012/#schedule (thanks Anil).

Leave a Comment

Filed under Uncategorized

xavierbot lives!

Or at least he now has his own git repository.

1 Comment

Filed under Uncategorized

OCaml 4.00.0 beta 2 is in Rawhide

OCaml 4.00.0 betas are available and I’ve packaged it up for Fedora Rawhide.

In other OCaml news, the various “retired” code generators (PPC64, MIPS, IA64, HPPA, Alpha) now have their own repository. Who wants to volunteer to write an s/390x code generator?

Leave a Comment

Filed under Uncategorized

OCaml Users and Developers conference 2012

The OCaml Users and Developers Conference (14th September 2012, Copenhagen) is looking for speakers!

I’m intending to submit a talk on using OCaml for code generation in libguestfs.

Leave a Comment

Filed under Uncategorized

whenjobs 0.7.0 released

I’ve just released whenjobs 0.7.0 to fix a couple of problems that I was having myself.

Firstly, while it’s a nice feature of whenjobs that I can set variables, sometimes I don’t want jobs to be triggered as a result of setting variables. The new whenjobs --whisper command lets me set variables without reevaluating when-clauses:

$ whenjobs --whisper libguestfs_version=1.17.16

Secondly, my jobs file was getting pretty long and unmaintainable (400 lines), since I added various libguestfs branches and hivex to be managed by whenjobs. So I’ve added a feature where you can split the jobs file into multiple files:

$ ls -l .whenjobs/*.ml
-rw-rw-r--. 1 rjones rjones 3284 Mar 13 19:10 .whenjobs/jobs_hivex.ml
-rw-rw-r--. 1 rjones rjones 4039 Mar 13 19:07 .whenjobs/jobs_libguestfs.ml
-rw-rw-r--. 1 rjones rjones 3766 Mar 13 19:09 .whenjobs/jobs_libguestfs_stable.ml
-rw-rw-r--. 1 rjones rjones   45 Mar 13 19:10 .whenjobs/jobs.ml

I think I described whenjobs as a “cron replacement”. It is a cron replacement for me, for my personal use, but it’s not officially a cron replacement and this project has nothing whatsoever to do with Red Hat or Fedora. The reason I say this is I get some pretty idiotic comments like these.

Leave a Comment

Filed under Uncategorized

whenjobs — job lists, cancelling, algorithmic cleanup etc

You can now list and cancel jobs:

$ whenjobs --jobs
61 job$1
	running in: /tmp/whenjobs20d88a48f2c4eb0062e1b44ded6d0ae7
	started at: 2012-02-23 22:43:20
62 job$2
	running in: /tmp/whenjobse9e6b93c3ced1967cbf8c5865d6a1ccb
	started at: 2012-02-23 22:43:20
$ whenjobs --cancel 62

You can manually start jobs. Gerd’s ocamlnet makes it almost trivial to add new RPCs between the tool and the daemon, so adding functions like this is simple.

You can put arbitrary OCaml actions into the job script too, so you can run code when a job is cleaned up, and you will (soon) be able to create jobs algorithmically. For example, the standard mailto cleanup lets you send mail containing the output of the job when it finishes.

let from = "me@example.com"
let to_addr = "you@example.com"
let prefix = "hostname "
let script = << # shell script here >>

job (prefix ^ "poll")
cleanup (Whentools.mailto ~from to_addr)
every minute : script

Leave a Comment

Filed under Uncategorized

A work in progress: whenjobs — another cron replacement

whenjobs (git repo) is a cron replacement. From the manual page …

Whenjobs is a powerful but simple replacement for cron. It lets you run jobs periodically like cron, but it also lets you trigger jobs to run when user-defined variables are set or change value.

Periodic jobs are written like this:

every 10 minutes :
<<
  # Get the current load average.
  load=`awk '{print $1}' /proc/loadavg`
  whenjobs --set load $load --type float
>>

When-statements let you create jobs that run based on variables set elsewhere:

when load >= 6 :
<<
  mail -s "ALERT: high load average: $load" $LOGNAME < /dev/null
>>

(When statements are "edge-triggered", meaning that this job will only run when the load goes from under 6 to ≥ 6).

The motivation is building things from git automatically. Here is another job script:

Every 10 minutes, get the latest tagged version from the git repository. The variable ‘version’ will be set to something like “v1.2.3″, “v1.2.4″, etc over time as new releases get tagged.

every 10 minutes :
<<
  cd /my/git/repo
  tag=`git-describe --tags`
  whenjobs --set version $tag
>>

When the ‘version’ variable changes (ie. a new release is tagged) try to build it. ‘changes’ is a function that compares the previous value of a variable from when this job last ran with the current value of a variable, and returns true if the previous and current values are different.

when changes version :
<<
  cd /my/git/buildrepo
  git pull
  git reset --hard $version
  ./configure
  make clean all check dist
  whenjobs --set successful_local_build $version
>>

In parallel, build on a remote machine.

when changes version :
<<
  ssh remote ./do_build $version
  whenjobs --set successful_remote_build $version
>>

Only when the new release has been successfully built on local and remote, upload it to the website.

when successful_local_build == version &&
     successful_remote_build == version :
<<
  cd /my/git/buildrepo
  curl -T name-$success.tar.gz ftp://ftp.example.com/upload/
>>

3 Comments

Filed under Uncategorized

Which foreign function interface is the best?

I’ve written libguestfs language bindings for Perl, Python, Ruby, Java, OCaml, PHP, Haskell, Erlang and C#. But which of these is the best? Which is the easiest? What makes this hard? Grubbing around in the internals of a language reveals mistakes made by the language designers, but what are the worst mistakes?

Note: There is source that goes with this. Download libguestfs-1.13.13.tar.gz and look in the respective directories.

The best

It’s going to be a controversial choice, but in my opinion: C#. You just add some simple annotations to your functions and structs, and you can call into shared libraries (or “DllImport”s as Microsoft insisted on calling them) directly. It’s just about as easy as directly calling C and that is no simple achievement considering how the underlying runtime of C# is very different from C.

Example: a C struct:

[StructLayout (LayoutKind.Sequential)]
public class _int_bool {
  int i;
  int b;
}

The worst

There are two languages in the doghouse: Haskell and PHP. PHP first because their method of binding is just very broken. For example, 64 bit types aren’t possible on a 32 bit platform. It requires a very complex autoconf setup. And the quality of their implementation is very poor verging on broken — it makes me wonder if the rest of PHP can be this bad.

Haskell: even though I’m an experienced functional programmer and have done a fair bit of Haskell programming in the past, the FFI is deeply strange and very poorly documented. I simply could not work out how to return anything other than integers from my functions. You end up with bindings that look like this:

write_file h path content size = do
  r <- withCString path $ \path -> withCString content $ \content -> withForeignPtr h (\p -> c_write_file p path content (fromIntegral size))
  if (r == -1)
    then do
      err <- last_error h
      fail err
    else return ()

The middle tier

There’s not a lot to choose between OCaml, Ruby, Java and Erlang. For all of them: you write bindings in C, there’s good documentation, it’s a bit tedious but basically mechanical, and in 3 out of 4 you’re dealing with a reasonable garbage collector so you have to be aware of GC issues.

Erlang is slightly peculiar because the method I chose (out of many possible) is to write an external process that talks to the Erlang over stdin/stdout. But I can’t fault their documentation, and the rest of it is sensible.

Example: Here is a function binding in OCaml, but with mechanical changes this could be Ruby, Java or Erlang too:

CAMLprim value
ocaml_guestfs_add_drive_ro (value gv, value filenamev)
{
  CAMLparam2 (gv, filenamev);
  CAMLlocal1 (rv);

  guestfs_h *g = Guestfs_val (gv);
  if (g == NULL)
    ocaml_guestfs_raise_closed ("add_drive_ro");

  char *filename = guestfs_safe_strdup (g, String_val (filenamev));
  int r;

  caml_enter_blocking_section ();
  r = guestfs_add_drive_ro (g, filename);
  caml_leave_blocking_section ();
  free (filename);
  if (r == -1)
    ocaml_guestfs_raise_error (g, "add_drive_ro");

  rv = Val_unit;
  CAMLreturn (rv);
}

The ugly

Perl: Get reading. You’d better start with perlxs because Perl uses its own language — C with bizarre macros on top so your code looks like this:

SV *
is_config (g)
      guestfs_h *g;
PREINIT:
      int r;
   CODE:
      r = guestfs_is_config (g);
      if (r == -1)
        croak ("%s", guestfs_last_error (g));
      RETVAL = newSViv (r);
 OUTPUT:
      RETVAL

After that, get familiar with perlguts. Perl has only 3 structures and you’ll be using them a lot. There are some brilliant things about Perl which shouldn’t be overlooked, including POD which libguestfs uses to make effortless manual pages.

Python: Best described as half arsed. Rather like the language itself.

Python, Ruby, Erlang: If your language depends on “int”, “long”, “long long” without defining what those mean, and differing based on your C compiler and platform, then you’ve made a big mistake that will unfortunately dog you throughout the runtime, FFIs and the language itself. It’s better either to define them precisely (like Java) or to just use int32 and int64 (like OCaml).

And finally, reference counting (Perl, Python). It’s tremendously easy to make mistakes that are fiendishly difficult to track down. It’s a poor way to do GC and it indicates to me that the language designer didn’t know any better.

16 Comments

Filed under Uncategorized

What I learned about AMQP

I’m playing with AMQP at the moment. I thought I’d start off with RabbitMQ first.

The good:

  • It works.
  • OCaml and Python programs can talk to each other.
  • It works across remote hosts. You need to open port 5672/tcp on the firewall.

The bad:

  • RabbitMQ and Apache Qpid use different versions of AMQP and are not interoperable! Good summary of the mess here. This might be resolved when everyone gets around to supporting AMQP 1-0, but even though that standard has been published, no one is expecting interop to happen for at least a year.
  • You can’t cluster different versions of the RabbitMQ broker together.
  • Even if all your hosts are at the same RabbitMQ version, you have to open more firewall ports and make changes to the start-up scripts. (Dynamic ports? Really? Did we learn nothing from NFS?)
  • Long, obscure Erlang error messages which don’t point to the problem. eg. You’ll get a good 25 lines of error message if another process is already bound to a port.
  • Possibly just a Fedora packaging problem: I managed to get my host into some state where it’s impossible to stop the RabbitMQ server except by kill -9, and after that I can’t start or stop it.

2 Comments

Filed under Uncategorized