ocaml · rizo · Feb 12, 2019 · Feb 13, 2019 · Feb 14, 2019 · Feb 14, 2019
diff --git a/notes/testable_examples.md b/notes/testable_examples.md
@@ -0,0 +1,208 @@
+# [DRAFT] Testable examples
+
+Library authors are encouraged to include examples and short snippets of code
+in documentation to demonstrate how to effectively use their library. Such code
+snippets are included in docstrings as code blocks and therefore cannot be
+executed and tested in the same way regular source files are. This leads to
+code duplication for library authors who want to make sure their examples can
+be correctly executed, and to out-of-date examples when they forget to update
+them as the library’s API changes.
+
+To address this problem odoc implements the ability to extract code blocks from
+documented interfaces and documentation pages (`mli` and `mld` files
+respectively) into source code files. With this, build systems can implement
+user-friendly workflows for execution, testing and even promotion of corrected
+examples. In addition, the extracted examples can be installed as documentation
+assets and thus avoid the need to duplicate them as separate files for
+distribution.
+
+## Named code blocks
+
+In the new version of odoc code blocks can be annotated with a file name. This
+file name is used by odoc to group related code blocks for extraction, and also
+to correctly annotate the markup for syntax highlighting.
+
+The following table demonstrates the two variants of code blocks: the
+traditionally supported *anonymous* code blocks and the new *named* code
+blocks.
+
+| **Anonymous code block** | **Named code block**                |
+| ------------------------ | ----------------------------------- |
+| `"{[" <content> "]}"`    | `"{" <filename> "[" <content> "]}"` |
+
+
+### Code extraction
+
+Both named and anonymous code blocks can be extracted by odoc via the
+command-line interface. Code blocks with the same file name in a given
+documentation file will be concatenated and written into a file with that name.
+Optionally a different output file name for a given group can be provided.
+Users are always required to provide an output file name for extraction of
+anonymous code blocks.
+
+To facilitate debugging and allow the tooling to implement expect-style
+promotions, popularized by cram and dune, the extracted examples can be
+optionally annotated with line numbers and the source file name (see [Line
+number directives](https://caml.inria.fr/pub/docs/manual-ocaml/lex.html#sec86)
+in the OCaml manual).
+
+**Note**: unrelated code blocks do not need to have a unique file name; it is
+recommended to group them by using a file name like `examples.ml` or similar.
+
+The described functionality will also be exposed as a library to facilitate
+integration with build systems and test promotion tooling.
+
+### Syntax highlighting
+
+The file names used to annotate code blocks are also used by odoc to decide
+what language should be used for syntax highlighting in the generated HTML. The
+language is decided based on the file name’s extension.
+
+**Warning:** code blocks without a file name will not have syntax highlighting.
+Once this feature is implemented, the currently used automatic language
+inference should be disabled.
+
+
+## Command-line interface
+
+The following simplified manual page defines the command-line interface:
+
+
+    odoc-extract-code(1)              Odoc Manual             odoc-extract-code(1)
+
+
+    NAME
+           odoc-extract-code - Extract code blocks included in documentation files.
+
+    SYNOPSIS
+           odoc extract-code [OPTION]... FILE
+
+    OPTIONS
+           --name=NAME
+               The name of the code block to extract.
+
+           -o PATH, --output=PATH
+               Output file path. If omitted, the provided NAME will be used.
+               Required for extraction of anonymous code blocks.
+
+           --anonymous
+               Extract code blocks without name. Cannot be used with the `--name'
+               option.
+
+           --with-line-numbers
+               Include line number and file name of the extracted code blocks.
+
+           FILE (required)
+               Input cmti, cmt, cmi, mli or mld file.
+
+
+    Odoc 11VERSION11                                          odoc-extract-code(1)
+
+
+## Dune integration
+
+Here is an excerpt from a documented interface file that demonstrates named code blocks.
+
+**Io_utils.mli**
+
+```ocaml
+val read_file : string -> string
+(** [read_file path] is the content of the file located at [path] read into a string.
+
+    {4 Examples}
+    Given a text file with the content:
+
+    {letters.txt[abcdef]}
+
+    The following example will print the number of letters in the file:
+
+    {count_letters.ml[
+    # let letters = read_file "assets/letters.txt";;
+    val letters : string = "abcdef"
+    # String.length letters;;
+    - : int = 6
+    ]} *)
+```
+
+The user wants to test the two code blocks in this example and all the
+anonymous code blocks. To achieve this, the library stanza can be instructed to
+extract and execute the code blocks from the documentation:
+
+
+```dune
+(library
+  (public_name io-utils)
+  (name Io_utils)
+  (libraries base bos)
+  (documentation
+    (extract_code
+      (letters.txt as assets/letters.txt)
+      count_letters.ml
+      (:anonymous as examples.ml))
+    (execute_code examples.ml count_letters.ml)))
+```
+
+Here is a detailed description of these options:
+
+- `(extract_code <filenames>)` where `<filenames>` field follows the [Ordered
+  set language](http://#). This is a set of code block names found in `mli`
+  files of the library that should be extracted into files. Here, `:standard`
+  refers to all annotated code blocks found in the library. Optionally the name
+  of the extracted file can be changed by using the following form:
+  `(<code_block> as <filename>)`, for example, `(letters.txt as
+  assets/letters.txt)`. Untitled code blocks can be extracted by providing a
+  file name to a special `:anonymous` name: `(:anonymous as <filename>)`.
+- `(execute_code <filenames>)` where `<filenames>` field follows the [Ordered
+  set language](http://#). This is a set of extracted code files that will be
+  compiled and executed during documentation generation. Currently only the
+  files with the `ml` and `re` extensions are supported.
+
+With these two options it is possible to precisely control what gets extracted
+and what gets executed. Furthermore the extracted files can also be installed
+by dune.
+
+The top-level `documentation` stanza for `mld` files can also be extended to
+support these options.
+
+----------
+
+## Requirements
+
+- Allow the errors to be highlighted in examples in the original file. Might require
+  https://github.com/ocaml/odoc/issues/147
+- Produce `.corrected` files to allow dune (or other build systems) to support
+  promotion of corrected files.
+- The code block name should contain the language information for syntax
+  highlighting.
+
+
+## Questions
+
+- Should odoc require code block annotations to be filenames with extension?
+  The extension could be used to identify the language and correctly do code
+  highlighting. On the other hand the code blocks could be annotated only with
+  the language name (*i.e.* `{ocaml[...]}`), but this would limit the scope of
+  the feature. In particular this would make it impossible to:
+  1. Write examples in code blocks that read input from files extracted from
+     other code blocks;
+  2. Explicitly select the examples that should be compiled (ignoring others);
+  3. Install multiple extracted examples without compiling them.
+- Should “execution” of `mli` files be supported too? Might be useful for basic
+  type-checking of the signature items.
+- Should code blocks with the same name from different `mli` and `mld` files
+  (in the same library) be extracted into the same file? This might be
+  problematic with anonymous code blocks. On the other hand the
+  `--with-line-numbers` can be used to keep track of the name of the original
+  file.
+
+
+## Alternatives
+
+- Examples could be loaded from existing files into odoc's output. This is more
+  limited than the current proposal because it does not allow interleaving
+  comments and code. On the other hand, it would not need any additional
+  build tooling as the examples can be directly compiled/tested.
+- Introduce something like `mlt` files where code is mixed with comments. These
+  files could be converted into `mld` files for HTML rendering. See
+  https://github.com/janestreet/toplevel_expect_test
+
diff --git a/src/html/comment.ml b/src/html/comment.ml
@@ -251,7 +251,8 @@ let rec nestable_block_element
   fun ?xref_base_uri ~to_syntax ~from_syntax -> function
   | `Paragraph [{value = `Raw_markup (`Html, s); _}] -> Html.Unsafe.data s
   | `Paragraph content -> Html.p (inline_element_list ?xref_base_uri content)
-  | `Code_block s ->
+  | `Code_block (_, s) ->
+    (* TODO(rizo): use code block id as a CSS class. *)
     let open Tree in
     (*
     TODO: This will probably be replaced by a proper plugin / PPX system.

diff --git a/src/model/comment.ml b/src/model/comment.ml
@@ -44,7 +44,7 @@ type inline_element = [
 
 type nestable_block_element = [
   | `Paragraph of (inline_element with_location) list
-  | `Code_block of string
+  | `Code_block of string option * string
   | `Verbatim of string
   | `Modules of Reference.module_ list
   | `List of

diff --git a/src/parser/ast.ml b/src/parser/ast.ml
@@ -22,7 +22,7 @@ type inline_element = [
 
 type nestable_block_element = [
   | `Paragraph of (inline_element with_location) list
-  | `Code_block of string
+  | `Code_block of string option * string
   | `Verbatim of string
   | `Modules of Reference.module_ list
   | `List of

diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll
@@ -201,11 +201,11 @@ let emit_verbatim input start_offset buffer =
   let t = trim_trailing_blank_lines t in
   emit input (`Verbatim t) ~start_offset
 
-let code_block c =
+let code_block ?id c =
   let c = trim_leading_blank_lines c in
   let c = trim_trailing_blank_lines c in
   let c = trim_leading_whitespace c in
-  `Code_block c
+  `Code_block (id, c)
 
 
 
@@ -299,6 +299,9 @@ rule token input = parse
   | (reference_start as start) ([^ '}']* as target) '}'
     { emit input (reference_token start target) }
 
+  | '{' (['a'-'z' 'A'-'Z']+ as id) '[' (code_block_text as c) "]}"
+    { emit input (code_block ~id c) }
+
   | "{[" (code_block_text as c) "]}"
     { emit input (code_block c) }
 
@@ -461,7 +464,7 @@ rule token input = parse
         ~start_offset:(Lexing.lexeme_end lexbuf)
         (Parse_error.not_allowed
           ~what:(Token.describe `End)
-          ~in_what:(Token.describe (`Code_block "")));
+          ~in_what:(Token.describe (`Code_block (None, ""))));
       emit input (code_block c) }
 
 

diff --git a/src/parser/semantics.ml b/src/parser/semantics.ml
@@ -94,7 +94,7 @@ let rec nestable_block_element
   | {value = `Paragraph content; location} ->
     Location.at location (`Paragraph (inline_elements status content))
 
-  | {value = `Code_block _; _}
+  | {value = `Code_block (_, _); _}
   | {value = `Verbatim _; _}
   | {value = `Modules _; _} as element ->
     element

diff --git a/src/parser/syntax.ml b/src/parser/syntax.ml
@@ -855,7 +855,7 @@ let rec block_element_list
       let acc = block::acc in
       consume_block_elements ~parsed_a_tag `After_text acc
 
-    | {value = `Code_block s | `Verbatim s as token; location} as next_token ->
+    | {value = `Code_block (_, s) | `Verbatim s as token; location} as next_token ->
       warn_if_after_tags next_token;
       warn_if_after_text next_token;
       if s = "" then
@@ -865,7 +865,7 @@ let rec block_element_list
       junk input;
       let block =
         match token with
-        | `Code_block _ -> `Code_block s
+        | `Code_block (id, _) -> `Code_block (id, s)
         | `Verbatim _ -> `Verbatim s
       in
       let block = accepted_in_all_contexts context block in

diff --git a/src/parser/token.ml b/src/parser/token.ml
@@ -65,7 +65,7 @@ type t = [
   | `Begin_link_with_replacement_text of string
 
   (* Leaf block element markup. *)
-  | `Code_block of string
+  | `Code_block of string option * string
   | `Verbatim of string
   | `Modules of string
 

diff --git a/test/html/cases/markup.mli b/test/html/cases/markup.mli
@@ -88,6 +88,14 @@
           ignore foo
     ]}
 
+    Code blocks can have an identifier attached to them:
+
+    {ocaml[
-    {ocaml[
+    {print-two-plus-two[
-    {ocaml[
+    {print-two-plus-two[
+        # print_int (2 + 2);;
+        4
+        - : unit = ()
+    ]}
+
     There are also verbatim blocks:
 
 {v

diff --git a/test/parser/test.ml b/test/parser/test.ml
@@ -331,6 +331,9 @@ let tests : test_suite list = [
     t "unterminated" "{[foo";
     t "unterminated-bracket" "{[foo]";
     t "trailing-cr" "{[foo\r]}";
+    (* t "basic-with-id" "{foo[bar]}"; *)
+    (* t "empty-with-id" "{lang[]}"; *)
+    (* t "with-style-id" "{b[foo]}"; *)
   ];
 
   "verbatim", [

diff --git a/test/print/print.ml b/test/print/print.ml
@@ -275,7 +275,8 @@ struct
     function
     | `Paragraph es ->
       List [Atom "paragraph"; List (List.map (at inline_element) es)]
-    | `Code_block c -> List [Atom "code_block"; Atom c]
+    | `Code_block (Some id, c) -> List [Atom "code_block"; Atom id; Atom c]
+    | `Code_block (None, c) -> List [Atom "code_block"; Atom c]
     | `Verbatim t -> List [Atom "verbatim"; Atom t]
     | `Modules ps ->
       List [Atom "modules"; List (List.map Reference_to_sexp.reference ps)]