Skip to content

Commit

Permalink
Make pretty_print and trim better handle whitespace in xml (closes #78)
Browse files Browse the repository at this point in the history
  • Loading branch information
v-gb committed Feb 3, 2024
1 parent d77da22 commit f2ea38c
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 16 deletions.
30 changes: 23 additions & 7 deletions src/markup.mli
Original file line number Diff line number Diff line change
Expand Up @@ -665,10 +665,15 @@ val text : ([< signal ], 's) stream -> (char, 's) stream
[`Text ss] signal, the result stream has the bytes of the strings [ss], and
all other signals are ignored. *)

val trim : (signal, 's) stream -> (signal, 's) stream
(** Trims insignificant whitespace in an HTML signal stream. Whitespace around
flow ("block") content does not matter, but whitespace in phrasing
("inline") content does. So, if the input stream is
val trim :
?preserve_whitespace:(name -> (name * string) list -> bool) ->
(signal, 's) stream ->
(signal, 's) stream
(** Trims insignificant whitespace.
In an HTML signal stream, whitespace around flow ("block") content does not
matter, but whitespace in phrasing ("inline") content does. So, if the input
stream is
{[
<div>
Expand All @@ -684,7 +689,11 @@ val trim : (signal, 's) stream -> (signal, 's) stream
<div><p><em>foo</em> bar</p></div>
]}
Note that whitespace around the [</em>] tag was preserved. *)
Note that whitespace around the [</em>] tag was preserved.
In an XML stream, whitespace is assumed to be insignificant unless an
ancestor element has the xml:space=preserve attribute. You can specify
[?preserve_whitespace] to preserve more or less whitespace. *)

val normalize_text :
([> `Text of string list ] as 'a, 's) stream -> ('a, 's) stream
Expand All @@ -695,7 +704,10 @@ val normalize_text :
after parsing, or generating streams from scratch, and would like to clean
up the [`Text] signals. *)

val pretty_print : (signal, 's) stream -> (signal, 's) stream
val pretty_print :
?preserve_whitespace:(name -> (name * string) list -> bool)
-> (signal, 's) stream
-> (signal, 's) stream
(** Adjusts the whitespace in the [`Text] signals in the given stream so that
the output appears nicely-indented when the stream is converted to bytes and
written.
Expand All @@ -720,7 +732,11 @@ val pretty_print : (signal, 's) stream -> (signal, 's) stream
Note that no whitespace was inserted around [<em>] and [</em>], because
doing so would create a word break that wasn't present in the original
stream. *)
stream.
In an XML stream, whitespace is assumed to be insignificant unless an
ancestor element has the xml:space=preserve attribute. You can specify
[?preserve_whitespace] to control the whitespace changes. *)

val html5 : ([< signal ], 's) stream -> (signal, 's) stream
(** Converts a signal stream into an HTML5 signal stream by stripping any
Expand Down
21 changes: 12 additions & 9 deletions src/utility.ml
Original file line number Diff line number Diff line change
Expand Up @@ -214,9 +214,12 @@ let normalize_text s =

make match_other

let is_phrasing_element (namespace, element_name) =
let default_preserve_whitespace (namespace, element_name) attrs =
if namespace <> html_ns then
false
List.exists
(fun ((ns, key), value) ->
ns = xml_ns && key = "space" && value = "preserve")
attrs
else
match element_name with
| "a" | "abbr" | "b" | "bdi" | "bdo" | "br" | "button" | "cite" | "code"
Expand All @@ -235,17 +238,17 @@ let rec trim_string_list trim = function
| "" -> trim_string_list trim more
| s -> s::more

let trim signals =
let trim ?(preserve_whitespace = default_preserve_whitespace) signals =
let signals = normalize_text signals in

let signals_and_flow =
Kstream.transform begin fun phrasing_nesting_level signal _throw k ->
match signal with
| `Start_element (name, _) ->
| `Start_element (name, attrs) ->
if phrasing_nesting_level > 0 then
k ([signal, false], Some (phrasing_nesting_level + 1))
else
if is_phrasing_element name then
if preserve_whitespace name attrs then
k ([signal, false], Some 1)
else
k ([signal, true], Some 0)
Expand Down Expand Up @@ -295,8 +298,8 @@ let trim signals =

let tab_width = 1

let pretty_print signals =
let signals = trim signals in
let pretty_print ?(preserve_whitespace = default_preserve_whitespace) signals =
let signals = trim ~preserve_whitespace signals in

let indent n =
let n = if n < 0 then 0 else n in
Expand All @@ -308,7 +311,7 @@ let pretty_print signals =
and flow indentation throw e k =
next signals throw e begin fun signal ->
match signal with
| `Start_element (name, _) when not @@ is_phrasing_element name ->
| `Start_element (name, attrs) when not @@ preserve_whitespace name attrs ->
(* If the next signal is `End_element, don't insert a line break. This
is mainly for collapsing inherently empty tags like <meta> and
<br>. *)
Expand Down Expand Up @@ -352,7 +355,7 @@ let pretty_print signals =
and phrasing indentation phrasing_nesting_level throw e k =
next signals throw e begin fun signal ->
match signal with
| `Start_element (name, _) when is_phrasing_element name ->
| `Start_element (name, attrs) when preserve_whitespace name attrs ->
list
[signal]
(phrasing indentation (phrasing_nesting_level + 1)) throw e k
Expand Down
45 changes: 45 additions & 0 deletions test/test_utility.ml
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,51 @@ let tests = [
`End_element;
`Text ["\n"]]);

("utility.pretty_print.xml" >:: fun _ ->
let parse_print ?preserve_whitespace () =
Markup.string (String.trim {|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:r><w:rPr></w:rPr><w:t xml:space="preserve">a </w:t></w:r><w:r><w:rPr></w:rPr><w:t>b c</w:t></w:r>
</w:document>
|})
|> Markup.parse_xml
|> Markup.signals
|> pretty_print ?preserve_whitespace
|> Markup.write_xml
|> Markup.to_string
in
assert_equal ~printer:Fun.id (parse_print ()) (String.trim {|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:r>
<w:rPr/>
<w:t xml:space="preserve">a </w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t>
b c
</w:t>
</w:r>
</w:document>
|} ^ "\n");
let preserve_whitespace name _ =
name = ("http://schemas.openxmlformats.org/wordprocessingml/2006/main", "t")
in
assert_equal ~printer:Fun.id (parse_print ~preserve_whitespace ()) (String.trim {|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:r>
<w:rPr/>
<w:t xml:space="preserve">a </w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t>b c</w:t>
</w:r>
</w:document>
|} ^ "\n");
);

("utility.html5" >:: fun _ ->
[doctype;
doctype;
Expand Down

0 comments on commit f2ea38c

Please sign in to comment.