Markdown: preserve non-ASCII whitespaces at the end of the line and beginning of the next line (prettier#16619)

Co-authored-by: fisker Cheung <[email protected]>
OKEAMAH · Sep 27, 2024 · a4be6a0 · a4be6a0
1 parent 9cf1c64
commit a4be6a0
Show file tree

Hide file tree

Showing 5 changed files with 112 additions and 6 deletions.
diff --git a/changelog_unreleased/markdown/16619.md b/changelog_unreleased/markdown/16619.md
@@ -0,0 +1,35 @@
+#### Preserve non-ASCII whitespaces at the end of the line and beginning of the next line (#16619 by @tats-u)
+
+Prettier removes non-ASCII spaces at the end of the line and beginning of the next line. However, this behavior is not consistent with the CommonMark spec.
+
+https://spec.commonmark.org/0.31.2/#soft-line-breaks
+
+> Spaces at the end of the line and beginning of the next line are removed:
+
+https://spec.commonmark.org/0.31.2/#unicode-whitespace-character
+
+> A Unicode whitespace character is a character in the Unicode Zs general category, or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
+
+> Unicode whitespace is a sequence of one or more Unicode whitespace characters.
+
+> A space is U+0020.
+
+The CommonMark spec doesn't mention non-ASCII spaces here, so removing them changes the content of the Markdown document.
+
+<!-- prettier-ignore -->
+```md
+<!-- Input -->
+ EM Space (U+2003) EM Space 
+
+　全角スペース (U+3000) 全形空白　
+
+<!-- Prettier stable -->
+EM Space (U+2003) EM Space
+
+全角スペース (U+3000) 全形空白
+
+<!-- Prettier main -->
+ EM Space (U+2003) EM Space 
+
+　全角スペース (U+3000) 全形空白　
+```
diff --git a/cspell.json b/cspell.json
@@ -335,7 +335,8 @@
         "author: \".*?\"",
         "authorURL: \".*?\"",
         "\"author\": \".*?\"",
-        "(long|after){3,}"
+        "(long|after){3,}",
+        "\\\\u\\{[0-9a-fA-F]{4,6}\\}"
     ],
     "files": [
         "*",

diff --git a/eslint.config.js b/eslint.config.js
@@ -19,15 +19,15 @@ const toPath = (file) => url.fileURLToPath(new URL(file, import.meta.url));
 
 const ignores = `
 .tmp
+test*.*
 # Ignore directories and files in 'tests/format'
 tests/format/**/*
 # Unignore directories and 'jsfmt.spec.js', 'format.test.js' file
 !tests/format/**/
 !tests/format/**/format.test.js
-# TODO: Remove this in 2025
-!tests/format/**/jsfmt.spec.js
+# TODO: Remove this in 2025, somehow '!tests/format/**/jsfmt.spec.js' does not work
+!tests/format/**/jsfmt.*.js
 tests/integration/cli/
-test*.*
 scripts/release/node_modules
 coverage/
 dist*/

diff --git a/src/language-markdown/print-preprocess.js b/src/language-markdown/print-preprocess.js
@@ -1,3 +1,4 @@
+import htmlWhitespaceUtils from "../utils/html-whitespace-utils.js";
 import { getOrderedListItemInfo, mapAst, splitText } from "./utils.js";
 
 // 0x0 ~ 0x10ffff
@@ -72,11 +73,13 @@ function splitTextIntoSentences(ast) {
     let { value } = node;
 
     if (parentNode.type === "paragraph") {
+      // CommonMark doesn't remove trailing/leading \f, but it should be
+      // removed in the HTML rendering process
       if (index === 0) {
-        value = value.trimStart();
+        value = htmlWhitespaceUtils.trimStart(value);
       }
       if (index === parentNode.children.length - 1) {
-        value = value.trimEnd();
+        value = htmlWhitespaceUtils.trimEnd(value);
       }
     }
 

diff --git a/tests/format/markdown/trim-space/format.test.js b/tests/format/markdown/trim-space/format.test.js
@@ -0,0 +1,67 @@
+function leadingTestCase() {
+  const paragraphs = Array.from(
+    { length: 3 },
+    (_, i) => " ".repeat(i + 1) + "This is not a code block.\n",
+  );
+  return {
+    name: "Trim leading U+0020 less than 4",
+    code: paragraphs.join("\n"),
+    output: paragraphs.map((p) => p.replace(/^ +/u, "")).join("\n"),
+  };
+}
+
+function trailingTestCase() {
+  const line = "The trailing space not producing hard break should be removed.";
+  // trailing tab is not treated as 4 spaces because trailing spaces don't
+  // define block structure. (https://spec.commonmark.org/0.30/#tabs)
+  // 2 or more spaces are treated as hard line break.
+  const spaces = [" ", "\t", "\t\t", "\t \t "];
+  return {
+    name: "Trim trailing U+0020 or tab that don't produce hard break",
+    code: spaces.map((sp) => line + sp + "\n").join("\n"),
+    output: spaces.map(() => line + "\n").join("\n"),
+  };
+}
+
+function preserveVariantSpacesTestCase() {
+  const variantSpaces = `\v\vvertical tab\v\v
+
+\u{a0}\u{a0}NBSP\u{a0}\u{a0}
+
+\u{2002}\u{2002}en space\u{2002}\u{2002}
+
+\u{2003}\u{2003}em space\u{2003}\u{2003}
+
+\u{2004}\u{2004}1/3em\u{2004}\u{2004}
+
+\u{2005}\u{2005}1/4em\u{2005}\u{2005}
+
+\u{2028}\u{2028}line separator\u{2028}\u{2028}
+
+\u{2029}\u{2029}paragraph separator\u{2029}\u{2029}
+
+\u{3000}\u{3000}全角空白\u{3000}\u{3000}
+
+\u{feff}\u{feff}zero width NBSP\u{feff}\u{feff}
+`;
+  return {
+    name: "Preserve non-ASCII Unicode spaces / line terminators, and vertical tab",
+    code: variantSpaces,
+    output: variantSpaces,
+  };
+}
+
+runFormatTest(
+  {
+    importMeta: import.meta,
+    snippets: [
+      leadingTestCase(),
+      trailingTestCase(),
+      preserveVariantSpacesTestCase(),
+    ],
+  },
+  ["markdown"],
+  {
+    proseWrap: "always",
+  },
+);