From a8f968ce9f2de39bac69f1b5cc63b10308877856 Mon Sep 17 00:00:00 2001
From: Eric Joanis <eric.joanis@nrc-cnrc.gc.ca>
Date: Wed, 18 Sep 2024 14:00:52 -0400
Subject: [PATCH] fix: use rich.print thoughout the wizard for wrapping and
 highlighting

Fixes formatting in many places.
Adds nice highlighting in some.
But, do not redefine builtin print, use rich_print to be clear everywhere.

Fixes #546: question titles being truncated instead of wrapped.
---
 everyvoice/tests/test_wizard.py               | 29 +++++-----
 everyvoice/wizard/__init__.py                 |  9 ++-
 everyvoice/wizard/basic.py                    | 32 +++++-----
 everyvoice/wizard/dataset.py                  | 58 ++++++++++---------
 everyvoice/wizard/prompts.py                  |  5 +-
 .../wizard/simple_term_menu_win_stub.py       |  9 ++-
 everyvoice/wizard/validators.py               | 14 +++--
 7 files changed, 85 insertions(+), 71 deletions(-)

diff --git a/everyvoice/tests/test_wizard.py b/everyvoice/tests/test_wizard.py
index 0cdc9a8c..b191fa50 100644
--- a/everyvoice/tests/test_wizard.py
+++ b/everyvoice/tests/test_wizard.py
@@ -1,6 +1,7 @@
 """Unit tests for the wizard module"""
 
 import os
+import re
 import string
 import tempfile
 from enum import Enum
@@ -266,7 +267,7 @@ def test_bad_contact_email_step(self):
             self.assertFalse(step.validate("test@test."))
             self.assertTrue(step.validate("test@test.ca"))
             self.assertFalse(step.validate(""))
-        output = stdout.getvalue()
+        output = stdout.getvalue().replace(" \n", " ")
         # Supporting email-validator prior and post 2.2.0 where the error string changed.
         self.assertTrue(
             "It must have exactly one @-sign" in output
@@ -382,10 +383,10 @@ def test_dataset_name(self):
         with patch_input(("", "bad/name", "good-name"), True):
             with capture_stdout() as stdout:
                 step.run()
-        output = stdout.getvalue().split("\n")
+        output = stdout.getvalue().replace(" \n", " ").split("\n")
         self.assertIn("your dataset needs a name", output[0])
         self.assertIn("is not valid", output[1])
-        self.assertIn("finished the configuration", output[2])
+        self.assertIn("finished the configuration", "".join(output[2:]))
         self.assertTrue(step.completed)
 
     def test_speaker_name(self):
@@ -393,10 +394,10 @@ def test_speaker_name(self):
         with patch_input(("", "bad/name", "good-name"), True):
             with capture_stdout() as stdout:
                 step.run()
-        output = stdout.getvalue().split("\n")
+        output = stdout.getvalue().replace(" \n", " ").split("\n")
         self.assertIn("speaker needs an ID", output[0])
         self.assertIn("is not valid", output[1])
-        self.assertIn("will be used as the speaker ID", output[2])
+        self.assertIn("will be used as the speaker ID", "".join(output[2:]))
         self.assertTrue(step.completed)
 
     def test_wavs_dir(self):
@@ -430,7 +431,7 @@ def test_sample_rate_config(self):
         ):
             with capture_stdout() as stdout:
                 step.run()
-        output = stdout.getvalue().split("\n")
+        output = stdout.getvalue().replace(" \n", " ").split(".\n")
         for i in range(4):
             self.assertIn("not a valid sample rate", output[i])
         self.assertTrue(step.completed)
@@ -605,7 +606,7 @@ def test_dataset_subtour(self):
         with patch_menu_prompt(1), capture_stdout() as out:
             validate_wavs_step.run()
         self.assertEqual(step.state[SN.validate_wavs_step][:2], "No")
-        self.assertIn("Warning: 4 wav files were not found", out.getvalue())
+        self.assertRegex(out.getvalue(), "Warning: .*4.* wav files were not found")
 
         text_processing_step = find_step(SN.text_processing_step, tour.steps)
         # 0 is lowercase, 1 is NFC Normalization, select both
@@ -712,9 +713,9 @@ def test_wrong_fileformat_psv(self):
         # try with: 1/tsv (wrong), 2/csv (wrong), 3/festival (wrong) and finally 0 tsv (right)
         with patch_menu_prompt((1, 2, 3, 0), multi=True) as stdout:
             format_step.run()
-        output = stdout.getvalue()
-        self.assertIn("does not look like a 'tsv'", output)
-        self.assertIn("does not look like a 'csv'", output)
+        output = re.sub(r" *\n", " ", stdout.getvalue())
+        self.assertRegex(output, "does not look like a .*'tsv'")
+        self.assertRegex(output, "does not look like a .*'csv'")
         self.assertIn("is not in the festival format", output)
         self.assertTrue(format_step.completed)
         # print(format_step.state)
@@ -736,10 +737,10 @@ def test_wrong_fileformat_festival(self):
         # try with: 0/psv (wrong), 1/tsv (wrong), 2/csv (wrong), and finally 3/festival (right)
         with patch_menu_prompt((0, 1, 2, 3), multi=True) as stdout:
             format_step.run()
-        output = stdout.getvalue()
-        self.assertIn("does not look like a 'psv'", output)
-        self.assertIn("does not look like a 'tsv'", output)
-        self.assertIn("does not look like a 'csv'", output)
+        output = stdout.getvalue().replace(" \n", " ")
+        self.assertRegex(output, "does not look like a .*'psv'")
+        self.assertRegex(output, "does not look like a .*'tsv'")
+        self.assertRegex(output, "does not look like a .*'csv'")
         self.assertTrue(format_step.completed)
         # print(format_step.state)
 
diff --git a/everyvoice/wizard/__init__.py b/everyvoice/wizard/__init__.py
index a86cc80a..5b2aa68b 100644
--- a/everyvoice/wizard/__init__.py
+++ b/everyvoice/wizard/__init__.py
@@ -6,6 +6,7 @@
 from typing import Optional, Sequence
 
 from anytree import RenderTree
+from rich import print as rich_print
 
 from .utils import EnumDict as State
 from .utils import NodeMixinWithNavigation
@@ -112,7 +113,9 @@ def run(self):
         else:
             self._validation_failures += 1
             if self._validation_failures > 20:
-                print(f"ERROR: {self.name} giving up after 20 validation failures.")
+                rich_print(
+                    f"ERROR: {self.name} giving up after 20 validation failures."
+                )
                 sys.exit(1)
             self.run()
 
@@ -180,7 +183,7 @@ def run(self):
             try:
                 node.run()
             except KeyboardInterrupt:
-                print("\nKeyboard Interrupt")
+                rich_print("\nKeyboard Interrupt")
                 node = self.keyboard_interrupt_action(node)
                 continue
             node = node.next()
@@ -191,7 +194,7 @@ def visualize(self, highlight: Optional[Step] = None):
             treestr = f"{pre}{node.name}"
             if node == highlight:
                 treestr += "        <========"
-            print(treestr.ljust(8))
+            rich_print(treestr.ljust(8))
 
 
 class StepNames(Enum):
diff --git a/everyvoice/wizard/basic.py b/everyvoice/wizard/basic.py
index 4ddb3209..9504a96f 100644
--- a/everyvoice/wizard/basic.py
+++ b/everyvoice/wizard/basic.py
@@ -3,7 +3,7 @@
 
 import questionary
 from email_validator import EmailNotValidError, validate_email
-from rich import print
+from rich import print as rich_print
 from rich.panel import Panel
 from rich.style import Style
 
@@ -51,18 +51,18 @@ def prompt(self):
 
     def validate(self, response):
         if len(response) == 0:
-            print("Sorry, your project needs a name. ")
+            rich_print("Sorry, your project needs a name. ")
             return False
         sanitized_path = slugify(response)
         if not sanitized_path == response:
-            print(
+            rich_print(
                 f"Sorry, the project name '{response}' is not valid, since it will be used to create a folder and special characters are not permitted for folder names. Please re-type something like '{sanitized_path}' instead."
             )
             return False
         return True
 
     def effect(self):
-        print(
+        rich_print(
             f"Great! Launching Configuration Wizard 🧙 for project named '{self.response}'."
         )
 
@@ -77,12 +77,12 @@ def validate(self, response):
         # Some languages don't use first and last names, so we can't necessarily check that response.split() > 1
         # It would be nice to have a better check here though.
         if len(response) < 3:
-            print("Sorry, EveryVoice requires a name to help prevent misuse.")
+            rich_print("Sorry, EveryVoice requires a name to help prevent misuse.")
             return False
         return True
 
     def effect(self):
-        print(f"Great! Nice to meet you, '{self.response}'.")
+        rich_print(f"Great! Nice to meet you, '{self.response}'.")
 
 
 class ContactEmailStep(Step):
@@ -112,8 +112,8 @@ def validate(self, response):
         except EmailNotValidError as e:
             # The exception message is a human-readable explanation of why it's
             # not a valid (or deliverable) email address.
-            print("EveryVoice requires a valid email address to prevent misuse.")
-            print(str(e))
+            rich_print("EveryVoice requires a valid email address to prevent misuse.")
+            rich_print(str(e))
             return False
         return True
 
@@ -121,7 +121,7 @@ def effect(self):
         emailinfo = validate_email(self.response, check_deliverability=False)
         email = emailinfo.normalized
         self.response = email
-        print(
+        rich_print(
             f"Great! Your contact email '{self.response}' will be saved to your models."
         )
 
@@ -154,7 +154,7 @@ def can_mkdir(self, path: Path) -> bool:
                     d.mkdir()
                     dirs_made.append(d)
                 except OSError as e:
-                    print(f"Sorry, could not create '{d}': {e}.")
+                    rich_print(f"Sorry, could not create '{d}': {e}.")
                     return False
         finally:
             for d in reversed(dirs_made):
@@ -164,12 +164,12 @@ def can_mkdir(self, path: Path) -> bool:
     def validate(self, response) -> bool:
         path = Path(response)
         if path.is_file():
-            print(f"Sorry, '{path}' is a file. Please select a directory.")
+            rich_print(f"Sorry, '{path}' is a file. Please select a directory.")
             return False
         assert self.state is not None, "OutputPathStep requires NameStep"
         output_path = path / self.state.get(StepNames.name_step, "DEFAULT_NAME")
         if output_path.exists():
-            print(
+            rich_print(
                 f"Sorry, '{output_path}' already exists. "
                 "Please choose another output directory or start again and choose a different project name."
             )
@@ -178,14 +178,14 @@ def validate(self, response) -> bool:
         # We create the output directory in validate() instead of effect() so that
         # failure can be reported to the user and the question asked again if necessary.
         if not self.can_mkdir(output_path):
-            print("Please choose another output directory.")
+            rich_print("Please choose another output directory.")
             return False
 
         self.output_path = output_path
         return True
 
     def effect(self):
-        print(
+        rich_print(
             f"The Configuration Wizard 🧙 will put your files here: '{self.output_path}'"
         )
 
@@ -463,7 +463,7 @@ def effect(self):
                 (config_dir / e2e_config_path).absolute(),
             )
 
-        print(
+        rich_print(
             Panel(
                 f"You've finished configuring your dataset. Your files are located at {config_dir.absolute()}",
                 title="Congratulations 🎉",
@@ -505,7 +505,7 @@ def effect(self):
                 self,
             )
         elif len([key for key in self.state.keys() if key.startswith("dataset_")]) == 0:
-            print("No dataset to save, exiting without saving any configuration.")
+            rich_print("No dataset to save, exiting without saving any configuration.")
         else:
             self.tour.add_step(
                 ConfigFormatStep(name=StepNames.config_format_step), self
diff --git a/everyvoice/wizard/dataset.py b/everyvoice/wizard/dataset.py
index 0d4e8428..0de7c2a0 100644
--- a/everyvoice/wizard/dataset.py
+++ b/everyvoice/wizard/dataset.py
@@ -7,7 +7,7 @@
 from typing import Sequence
 
 import questionary
-import rich
+from rich import print as rich_print
 from rich.panel import Panel
 from rich.style import Style
 from tqdm import tqdm
@@ -40,18 +40,18 @@ def prompt(self):
 
     def validate(self, response):
         if len(response) == 0:
-            print("Sorry, your dataset needs a name.")
+            rich_print("Sorry, your dataset needs a name.")
             return False
         slug = slugify(response)
         if not slug == response:
-            print(
+            rich_print(
                 f"Sorry, your name: '{response}' is not valid, since it will be used to create a file and special characters are not permitted in filenames. Please re-type something like {slug} instead."
             )
             return False
         return True
 
     def effect(self):
-        print(
+        rich_print(
             f"Great! The Configuration Wizard 🧙 finished the configuration for your dataset named '{self.response}'"
         )
 
@@ -75,7 +75,7 @@ def validate(self, response):
 
     def effect(self):
         if self.state[StepNames.dataset_permission_step.value].startswith("No"):
-            print("OK, we'll ask you to choose another dataset then!")
+            rich_print("OK, we'll ask you to choose another dataset then!")
             self.children = []
             del self.root.state[self.state_subset]
 
@@ -101,7 +101,7 @@ def validate(self, response) -> bool:
         glob_iter = glob.iglob(os.path.join(path_expanded, "**/*.wav"), recursive=True)
         contains_wavs = next(glob_iter, None) is not None
         if not contains_wavs:
-            print(
+            rich_print(
                 f"Sorry, no .wav files were found in '{path_expanded}'. Please choose a directory with audio files."
             )
         return valid_path and contains_wavs
@@ -120,13 +120,13 @@ def validate(self, response):
         try:
             self.response = int(response)
             if self.response < 100 or float(response) != self.response:
-                print(
+                rich_print(
                     f"{response} is not a valid sample rate. Please enter an integer representing the sample rate in Hertz of your data."
                 )
                 return False
             return True
         except ValueError:
-            print(
+            rich_print(
                 f"{response} is not a valid sample rate. Please enter an integer representing the sample rate in Hertz of your data."
             )
             return False
@@ -167,18 +167,18 @@ def looks_like_sv(self, file_type, separator) -> bool:
         if len(initial_records) > 0:
             column_count = len(initial_records[0])
         else:
-            print(f"ERROR: File ({filelist_path} is empty. Please double check.")
+            rich_print(f"ERROR: File ({filelist_path} is empty. Please double check.")
             sys.exit(1)
 
         if column_count < 2:
-            print(
+            rich_print(
                 f"File '{filelist_path}' does not look like a '{file_type}' file: no record separator found on header line."
             )
             return False
 
         for i, record in enumerate(initial_records):
             if len(record) != column_count:
-                print(
+                rich_print(
                     f"File '{filelist_path}' does not look like a '{file_type}' file: the {i}th record has a different number of fields than the header row."
                 )
                 return False
@@ -192,7 +192,7 @@ def validate(self, response):
                 _ = read_festival(filelist_path, 10)
                 return True
             except ValueError:
-                print(f"File '{filelist_path}' is not in the festival format.")
+                rich_print(f"File '{filelist_path}' is not in the festival format.")
                 return False
 
         separator = self.separators.get(response, None)
@@ -280,13 +280,13 @@ def wav_file_early_validation(self) -> int:
         file_list_size = len(filelist_data)
         sample: Sequence[int]
         if file_list_size > MAX_SAMPLES:
-            print(
+            rich_print(
                 f"Checking a sample of {MAX_SAMPLES} of your audio files to make sure they are present."
             )
             sampled_text = " sampled"
             sample = sorted(random.sample(range(file_list_size), MAX_SAMPLES))
         else:
-            print("Checking if all your audio files are present.")
+            rich_print("Checking if all your audio files are present.")
             sampled_text = ""
             sample = range(file_list_size)
         for item in sample:
@@ -300,15 +300,17 @@ def wav_file_early_validation(self) -> int:
         if files_not_found:
             n = len(files_not_found)
             if n == 1:
-                print(
+                rich_print(
                     f"Warning: wav file '{files_not_found[0]}' was not found, please check your filelist."
                 )
             else:
-                print(
+                rich_print(
                     f"Warning: {n}{sampled_text} wav files were not found, including '{files_not_found[0]}' and '{files_not_found[1]}'.\nPlease check your wavs directory '{wavs_dir}' and your filelist."
                 )
             return n
-        print(f"Great! All{sampled_text} audio files found in directory '{wavs_dir}'.")
+        rich_print(
+            f"Great! All{sampled_text} audio files found in directory '{wavs_dir}'."
+        )
         return 0
 
     def prompt(self):
@@ -337,7 +339,7 @@ def effect(self):
                 self,
             )
         elif self.response.startswith("No"):
-            rich.print(
+            rich_print(
                 Panel(
                     "Continuing despite missing audio files. Make sure you fix your filelist later or add missing audio files, otherwise entries in your filelist with missing audio files will be skipped during preprocessing and therefore be ignored during training.",
                     title="Missing audio files",
@@ -462,7 +464,7 @@ def validate(self, response):
 
     def effect(self):
         if self.state[StepNames.data_has_header_line_step] == "no":
-            print("Reinterpreting your first row as a record, not headers.")
+            rich_print("Reinterpreting your first row as a record, not headers.")
             self.state["filelist_data_list"].insert(
                 0, self.state["filelist_data_list"][0]
             )
@@ -478,7 +480,7 @@ def prompt(self):
         elif len(self.state.get("selected_headers", [])) >= len(
             self.state["filelist_data_list"][0]
         ):
-            print("No columns left, we will assume you have no speaker column.")
+            rich_print("No columns left, we will assume you have no speaker column.")
             return "no"
         else:
             return get_response_from_menu_prompt(
@@ -490,7 +492,7 @@ def validate(self, response):
         return response in self.choices
 
     def effect(self):
-        print(
+        rich_print(
             "Note: if your dataset has speakers with names matching with speakers from other provided datasets, they will be considered the same. If this is not the desired behaviour, you will have to alter the speaker IDs in the relevant datasets to indicate that they are different."
         )
         if self.state[StepNames.data_has_speaker_value_step] == "yes":
@@ -527,7 +529,7 @@ def effect(self):
         else:
             # Even though AddSpeakerStep is not run, the speaker ID is assigned to its keyword
             self.state[StepNames.add_speaker_step] = f"speaker_{self.dataset_index}"
-            print(
+            rich_print(
                 f"OK, '{self.state[StepNames.add_speaker_step]}' will be used as a speaker ID in this dataset then."
             )
 
@@ -540,18 +542,18 @@ def prompt(self):
 
     def validate(self, response):
         if len(response) == 0:
-            print("Sorry, the speaker needs an ID.")
+            rich_print("Sorry, the speaker needs an ID.")
             return False
         slug = slugify(response)
         if not slug == response:
-            print(
+            rich_print(
                 f"Sorry, your ID: '{response}' is not valid. Please avoid using special characters in it and re-type something like {slug} instead."
             )
             return False
         return True
 
     def effect(self):
-        print(
+        rich_print(
             f"Great! '{self.response}' will be used as the speaker ID for this dataset."
         )
 
@@ -566,7 +568,7 @@ def prompt(self):
         elif len(self.state.get("selected_headers", [])) >= len(
             self.state["filelist_data_list"][0]
         ):
-            print("No columns left, we will assume you have no language column.")
+            rich_print("No columns left, we will assume you have no language column.")
             return "no"
         else:
             return get_response_from_menu_prompt(
@@ -602,7 +604,7 @@ def prompt(self):
         from everyvoice.text.phonemizer import AVAILABLE_G2P_ENGINES
 
         g2p_langs_full = get_arpabet_langs()[1]
-        print(
+        rich_print(
             "Note: if your dataset has more than one language in it, you will have to provide a 'language' column to indicate the language of each sample, because the configuration wizard can't guess!"
         )
         # TODO: currently we only support the languages from g2p, but we should add more
@@ -810,7 +812,7 @@ def prompt(self):
         # TODO: This is a bit of a weird step, since it doesn't really prompt anything, it just applies the effect of trying to find
         #       character graphemes/phones. I'd still like to keep it here, since we might add more to this step in the future, and
         #       I don't want to lump the grapheme clustering logic into the effect of another step.
-        print(
+        rich_print(
             f"We will now read your entire dataset and try to determine the characters and/or phones in your dataset according to Unicode Grapheme clustering rules. Please carefully check your {TEXT_CONFIG_FILENAME_PREFIX}.yaml file (which is created at the end of the wizard) and adjust the symbol set as appropriate. If your language uses standard punctuation symbols to represent sounds, it is extra important that you go remove any of these symbols from the punctuation categories."
         )
         return True
diff --git a/everyvoice/wizard/prompts.py b/everyvoice/wizard/prompts.py
index b3ea8d70..ba06383a 100644
--- a/everyvoice/wizard/prompts.py
+++ b/everyvoice/wizard/prompts.py
@@ -62,9 +62,12 @@ def get_response_from_menu_prompt(
     """
     if prompt_text:
         rich.print(Panel(prompt_text))
+    # using TerminalMenu's title parameter truncates the title to the width of
+    # the menu instead of wrapping it, so we use rich.print instead.
+    if title:
+        rich.print(title)
     menu = simple_term_menu.TerminalMenu(
         choices,
-        title=title,
         multi_select=multi,
         multi_select_select_on_accept=(not multi),
         multi_select_empty_ok=multi,
diff --git a/everyvoice/wizard/simple_term_menu_win_stub.py b/everyvoice/wizard/simple_term_menu_win_stub.py
index 81b1adbf..89f73d03 100644
--- a/everyvoice/wizard/simple_term_menu_win_stub.py
+++ b/everyvoice/wizard/simple_term_menu_win_stub.py
@@ -31,21 +31,20 @@ class TerminalMenu:
     It's not nearly as pretty as what we do on Linux with the real thing, but
     it's good enough for development and testing."""
 
-    def __init__(self, choices, title, multi_select, **_kwargs):
+    def __init__(self, choices, multi_select, **_kwargs):
         self.choices = choices
-        self.title = title
         self.multi_select = multi_select
 
     def show(self):
         if self.multi_select:
             responses = questionary.checkbox(
-                self.title,
-                self.choices,
+                message="",
+                choices=self.choices,
             ).unsafe_ask()
             return [self.choices.index(response) for response in responses]
         else:
             response = questionary.select(
-                self.title,
+                message="",
                 choices=self.choices,
             ).unsafe_ask()
             return self.choices.index(response)
diff --git a/everyvoice/wizard/validators.py b/everyvoice/wizard/validators.py
index 12f78873..f7550249 100644
--- a/everyvoice/wizard/validators.py
+++ b/everyvoice/wizard/validators.py
@@ -1,5 +1,7 @@
 from pathlib import Path
 
+from rich import print as rich_print
+
 
 def validate_path(value: str, is_dir=False, is_file=False, exists=False):
     path = Path(value).expanduser()
@@ -8,15 +10,19 @@ def validate_path(value: str, is_dir=False, is_file=False, exists=False):
             "The path must be either a file or directory, but both or neither were specified."
         )
     if is_dir and path.is_file():
-        print(f"Sorry, the path '{path}' is a file. Please select a directory.")
+        rich_print(f"Sorry, the path '{path}' is a file. Please select a directory.")
         return False
     if is_file and path.is_dir():
-        print(f"Sorry, the path '{path}' is a directory. Please select a file.")
+        rich_print(f"Sorry, the path '{path}' is a directory. Please select a file.")
         return False
     if not exists and path.exists():
-        print(f"Sorry, the path '{path}' already exists. Please choose another path.")
+        rich_print(
+            f"Sorry, the path '{path}' already exists. Please choose another path."
+        )
         return False
     if exists and not path.exists():
-        print(f"Sorry, the path '{path}' doesn't exist. Please choose another path.")
+        rich_print(
+            f"Sorry, the path '{path}' doesn't exist. Please choose another path."
+        )
         return False
     return True