~williewillus/persistent

fa6bb1cf970d383964dbe0e5b8c47b74e8520d03 — Vincent Lee 1 year, 9 months ago
Init
8 files changed, 297 insertions(+), 0 deletions(-)

A .gitignore
A dune-project
A lib/dune
A lib/vector.ml
A lib/vector.mli
A persistent.opam
A test/dune
A test/persistent.ml
A  => .gitignore +1 -0
@@ 1,1 @@
_build/

A  => dune-project +26 -0
@@ 1,26 @@
(lang dune 3.7)

(name persistent)

(generate_opam_files true)

(source
 (github username/reponame))

(authors "Vincent Lee")

(maintainers "Vincent Lee")

(license LICENSE)

(documentation https://url/to/documentation)

(package
 (name persistent)
 (synopsis "Clojure-style persistent data structures, sequences, and transducers for OCaml")
 (description "A longer description")
 (depends ocaml dune)
 (tags
  (topics "to describe" your project)))

; See the complete stanza docs at https://dune.readthedocs.io/en/stable/dune-files.html#dune-project

A  => lib/dune +3 -0
@@ 1,3 @@
(library
 (public_name persistent)
 (name persistent))

A  => lib/vector.ml +213 -0
@@ 1,213 @@
(* Persistent Vectors are implement as a wide-branching tree. Here, the branch factor
   is chosen to be 32, which means at most 5 hops in the tree need to be made to find
   the element in question.
   TODO: How does this square with 32 bit OCaml only having 31 real bits available?
 *)
let bits_per_level = 5
let branch_factor = 1 lsl bits_per_level

(* Paths down into the tree are indexed by the bits of the logical index, left to right,
   in groups of `bits_per_level`. Note that unused bits are kept on the left. That is,
   if a given vector's size is small enough to be addressed with 15 bits, it is the rightmost
   15 bits that are used, and all upper bits are unused.
 *)

(* inner_mask is used to access items within a single level of the tree *)
let inner_mask = branch_factor - 1
(* outer_mask is used to hop between levels of the tree *)
let outer_mask = lnot inner_mask

type 'a node = {
    array : 'a node_item array;
  } [@@unboxed]
and 'a node_item =
  (* Actual data to be stored *)
  | Leaf of 'a
  (* Continue downwards into the tree *)
  | Inner of 'a node
  (* Nothing along this path *)
  | Empty

(* TODO: Figure out if I can make this covariant (see map.ml) *)
type 'a t = {
    (* The number of elements in the vector *)
    count : int;
    (* The root of the tree *)
    root : 'a node;
    (* Height of the tree, but in bits instead of logical levels.
       Storing it this way allows for the logic below to not have to
       constantly be multiplying by bits_per_level when doing bitwise operations.
       Empty vector has height `bits_per_level`, i.e. this value is never zero.
     *)
    height : int;
    (* The "tail" is an optimization to the vector that allows true-constant time
       access to the end of the vector, in recognition of the fact that vectors
       are often pushed and popped at the end.
       Typed as 'a node_item to match node type, but is logically 'a option array.
       i.e., this should never hold Inner, only Leaf or Empty *)
    tail : 'a node_item array;
  }

let empty_value = {
    count = 0;
    height = bits_per_level;
    root = { array = Array.make branch_factor Empty };
    tail = [||];
  }

let empty () : 'a t  =
  (* XXX: Extremely ugly!!
     If I just expose `empty_value` directly, I get typecheck errors.
     I believe this is because the typechecker wants only one concrete type
     parameter 'a, and I don't know how to convince it that I can share one empty_value
     for all possible type parameters 'a.
     I think it's because the arrays in empty_value
     are technically mutable, but we never mutate them, so it's okay.
     Use the ugly cast in the meantime.
   *)
  Obj.magic empty_value

(** Returns the length of the vector *)
let length (t : 'a t) : int = t.count

(** Returns the logical index represented by tail.(0) *)
let tail_offset (t : 'a t) : int =
  let count = (length t) in
  if count < branch_factor then
    0
  else
    (count - 1) land outer_mask

(** Returns the array holding the element at logical index i *)
let array_for (t : 'a t) (i : int) : 'a node_item array option =
  if i < 0 || i >= (length t) then
    None
  else if i >= (tail_offset t) then
    Some t.tail
  else
    let rec descend node height =
      if height <= 0 then
        node.array
      else
        let idx = (i lsr height) land inner_mask in
        match node.array.(idx) with
        | Inner node -> descend node (height - bits_per_level)
        | _ -> failwith "Invariant violation, should only have inner nodes in array_for"
    in
    Some (descend t.root t.height)

(** Gets the element at index, or None if out of bounds *)
let nth_opt (t : 'a t) (index : int) : 'a option =
  let last_level_index = index land inner_mask in
  match array_for t index with
  | Some array when last_level_index < (Array.length array) ->
     (match array.(last_level_index) with
      | Leaf value -> Some value
      | Empty -> None
      | Inner _ -> failwith "Invariant violation, not expecting Inner in last-level array")
  | _ -> None

(** Gets the element at index, or raises Invalid_argument if out of bounds *)
let nth (t : 'a t) (index : int) : 'a =
  match nth_opt t index with
  | Some value -> value
  | None -> invalid_arg "Index out of bounds"

(** Appends an element to the vector *)
let append (t : 'a t) (value : 'a) : 'a t =
  let old_count = length t in
  if old_count - (tail_offset t) < branch_factor then
    (* *)
    (* Create a new tail that is one longer.
       Note: Array.init is no faster than this,
       because it gets the zeroth element, fills the
       array with that, then sets all the subsequent elements.
       Do it manually here to avoid a closure allocation.
     *)
    let old_tail_length = Array.length t.tail in
    let new_tail = Array.make (old_tail_length + 1) (Leaf value) in
    Array.blit t.tail 0 new_tail 0 old_tail_length;
    { t with tail = new_tail; count = old_count + 1 }
  else
    (* No more room in tail, insert the tail into the tree and make a new tail *)
    let rec create_hierarchy height node =
      if Int.equal 0 height then
        node
      else
        let new_node = { array = Array.make branch_factor Empty } in
        let child = create_hierarchy (height - bits_per_level) node in
        new_node.array.(0) <- Inner child;
        new_node
    in
    let rec push_tail height parent tail =
      let sub_index = ((old_count - 1) lsr height) land inner_mask in
      let result = { array = Array.copy parent.array } in
      let to_insert =
        if Int.equal height bits_per_level then
          tail
        else
          match parent.array.(sub_index) with
          | Inner child -> push_tail (height - bits_per_level) child tail
          | Empty -> create_hierarchy (height - bits_per_level) tail
          | Leaf _ -> failwith "Should not see leaves here"
      in
      result.array.(sub_index) <- Inner to_insert;
      result
    in
    let node_to_insert = { array = t.tail } in
    let (root, height) =
      if (old_count lsr bits_per_level) <= (1 lsl t.height) then
        (* Tree doesn't need to grow taller *)
        let new_root = push_tail t.height t.root node_to_insert in
        new_root, t.height
      else
        (* Tree needs to grow taller *)
        let new_root = { array = Array.make branch_factor Empty } in
        new_root.array.(0) <- Inner t.root;
        new_root.array.(1) <- Inner (create_hierarchy t.height node_to_insert);
        new_root, t.height + bits_per_level
    in
    {
      root;
      height;
      count = old_count + 1;
      tail = [|(Leaf value)|];
    }

(* Returns a new vector with index i associated to a. If i is equal to the current vector size, a is appended to the vector, expanding it.
   If i is less than zero or greater than the current vector size, Invalid_argument is raised. *)
let update (t : 'a t) (i : int) (value : 'a) : 'a t =
  let count = length t in
  if i < 0 || i > count then
    invalid_arg "Out of bounds"
  else if Int.equal i count then
    (* TODO: reconsider this design choice of clojure? *)
    append t value
  else if i >= (tail_offset t) then
    (* Replace in tail *)
    let new_tail = Array.copy t.tail in
    new_tail.(i land inner_mask) <- Leaf value;
    { t with tail = new_tail }
  else
    (* Replace in tree by rebuilding the path down to the node *)
    let rec rebuild node height =
      let new_node = { array = Array.copy node.array } in
      if Int.equal height 0 then
        let () = new_node.array.(i land inner_mask) <- Leaf value in
        new_node
      else
        let child_index = (i lsr height) land inner_mask in
        let old_child = match node.array.(child_index) with
          | Inner node -> node
          | _ -> failwith "Invariant violation, only expecting Inner"
        in
        let child = rebuild old_child (height - bits_per_level) in
        let () = new_node.array.(child_index) <- Inner child in
        new_node
    in
    { t with root = rebuild t.root t.height }

(** Dead-simple Sequence implementation.
    TODO: optimize by holding onto the leaf so we don't have to walk down every time *)
let to_seq (t : 'a t) : 'a Seq.t =
  Seq.init t.count (fun i -> nth t i)

A  => lib/vector.mli +20 -0
@@ 1,20 @@
type 'a t

(** Returns the length of the vector *)
val length : 'a t -> int

(** Returns the empty vector *)
val empty : unit -> 'a t

(** Gets the element at index, or None if out of bounds *)
val nth_opt : 'a t -> int -> 'a option

(** Gets the element at index, or raises Invalid_argument if out of bounds *)
val nth : 'a t -> int -> 'a

(** Appends an element to the vector *)
val append : 'a t -> 'a -> 'a t

val update : 'a t -> int -> 'a -> 'a t

val to_seq : 'a t -> 'a Seq.t

A  => persistent.opam +32 -0
@@ 1,32 @@
# This file is generated by dune, edit dune-project instead
opam-version: "2.0"
synopsis:
  "Clojure-style persistent data structures, sequences, and transducers for OCaml"
description: "A longer description"
maintainer: ["Vincent Lee"]
authors: ["Vincent Lee"]
license: "LICENSE"
tags: ["topics" "to describe" "your" "project"]
homepage: "https://github.com/username/reponame"
doc: "https://url/to/documentation"
bug-reports: "https://github.com/username/reponame/issues"
depends: [
  "ocaml"
  "dune" {>= "3.7"}
  "odoc" {with-doc}
]
build: [
  ["dune" "subst"] {dev}
  [
    "dune"
    "build"
    "-p"
    name
    "-j"
    jobs
    "@install"
    "@runtest" {with-test}
    "@doc" {with-doc}
  ]
]
dev-repo: "git+https://github.com/username/reponame.git"

A  => test/dune +2 -0
@@ 1,2 @@
(test
 (name persistent))

A  => test/persistent.ml +0 -0