From c5a2055d60354d8e4cbdeb9ff8a947b4b5b0da04 Mon Sep 17 00:00:00 2001 From: Steven Kearnes Date: Fri, 12 Jul 2024 10:36:54 -0400 Subject: [PATCH 1/2] Make dataset name and description required --- ord_schema/scripts/enumerate_dataset.py | 12 +++++++-- ord_schema/scripts/enumerate_dataset_test.py | 6 ++++- ord_schema/scripts/process_dataset_test.py | 22 +++++++++-------- ord_schema/scripts/validate_dataset_test.py | 4 +-- ord_schema/templating.py | 8 ++++-- ord_schema/templating_test.py | 24 +++++++++--------- ord_schema/validations.py | 4 +++ ord_schema/validations_test.py | 26 +++++++++++++++++--- 8 files changed, 74 insertions(+), 32 deletions(-) diff --git a/ord_schema/scripts/enumerate_dataset.py b/ord_schema/scripts/enumerate_dataset.py index 71c993c4..2835fc1a 100644 --- a/ord_schema/scripts/enumerate_dataset.py +++ b/ord_schema/scripts/enumerate_dataset.py @@ -14,9 +14,11 @@ """Creates a Dataset by enumerating a template with a spreadsheet. Usage: - dataset_templating.py --template= --spreadsheet= --output= [--no-validate] + dataset_templating.py --name= --description= --template= --spreadsheet= --output= [--no-validate] Options: + --name= Dataset name + --description= Dataset description --template= Path to a Reaction pbtxt file defining a template --spreadsheet= Path to a spreadsheet file with a header row matching template placeholders --output= Filename for output Dataset @@ -39,7 +41,13 @@ def main(kwargs): kwargs["--template"], kwargs["--spreadsheet"], ) - dataset = templating.generate_dataset(template_string, df, validate=not kwargs["--no-validate"]) + dataset = templating.generate_dataset( + name=kwargs["--name"], + description=kwargs["--description"], + template_string=template_string, + df=df, + validate=not kwargs["--no-validate"], + ) logger.info("writing new Dataset to %s", kwargs["--output"]) message_helpers.write_message(dataset, kwargs["--output"]) diff --git a/ord_schema/scripts/enumerate_dataset_test.py 
b/ord_schema/scripts/enumerate_dataset_test.py index d52c5517..276a150b 100644 --- a/ord_schema/scripts/enumerate_dataset_test.py +++ b/ord_schema/scripts/enumerate_dataset_test.py @@ -91,7 +91,7 @@ def setup(tmp_path) -> tuple[str, str, str]: ) spreadsheet_filename = os.path.join(dirname, "spreadsheet.csv") data.to_csv(spreadsheet_filename, index=False) - expected = dataset_pb2.Dataset() + expected = dataset_pb2.Dataset(name="test", description="test") reaction1 = expected.reactions.add() reaction1_compound1 = reaction1.inputs["test"].components.add() reaction1_compound1.identifiers.add(value="C", type="SMILES") @@ -132,6 +132,10 @@ def test_main(setup, tmp_path): template_filename, spreadsheet_filename, expected = setup output_filename = (tmp_path / "dataset.pbtxt").as_posix() argv = [ + "--name", + "test", + "--description", + "test", "--template", template_filename, "--spreadsheet", diff --git a/ord_schema/scripts/process_dataset_test.py b/ord_schema/scripts/process_dataset_test.py index 9eae1d70..75d5efdb 100644 --- a/ord_schema/scripts/process_dataset_test.py +++ b/ord_schema/scripts/process_dataset_test.py @@ -48,6 +48,8 @@ def setup(self, tmp_path) -> tuple[str, str]: reaction1.provenance.record_created.person.username = "test" reaction1.provenance.record_created.person.email = "test@example.com" dataset1 = dataset_pb2.Dataset( + name="test1", + description="test1", dataset_id="ord_dataset-00000000000000000000000000000000", reactions=[reaction1], ) @@ -55,7 +57,7 @@ def setup(self, tmp_path) -> tuple[str, str]: message_helpers.write_message(dataset1, dataset1_filename) # reaction2 is empty. 
reaction2 = reaction_pb2.Reaction() - dataset2 = dataset_pb2.Dataset(reactions=[reaction1, reaction2]) + dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction1, reaction2]) dataset2_filename = (tmp_path / "dataset2.pb").as_posix() message_helpers.write_message(dataset2, dataset2_filename) yield dataset1_filename, dataset2_filename @@ -140,7 +142,7 @@ def setup(self, tmp_path) -> tuple[str, str]: reaction.provenance.record_created.person.email = "test@example.com" reaction.reaction_id = "ord-10aed8b5dffe41fab09f5b2cc9c58ad9" dataset_id = "ord_dataset-64b14868c5cd46dd8e75560fd3589a6b" - dataset = dataset_pb2.Dataset(reactions=[reaction], dataset_id=dataset_id) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction], dataset_id=dataset_id) # Make sure the initial dataset is valid. validations.validate_message(dataset) os.makedirs(os.path.join("data", "64")) @@ -213,7 +215,7 @@ def test_add_dataset(self, setup): reaction.provenance.record_created.person.username = "test" reaction.provenance.record_created.person.email = "test@example.com" reaction.reaction_id = "test" - dataset = dataset_pb2.Dataset(reactions=[reaction]) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction]) this_dataset_filename = os.path.join(test_subdirectory, "test.pbtxt") message_helpers.write_message(dataset, this_dataset_filename) added, removed, changed, filenames = self._run(test_subdirectory) @@ -246,14 +248,14 @@ def test_add_sharded_dataset(self, setup): reaction.provenance.record_created.person.username = "test2" reaction.provenance.record_created.person.email = "test2@example.com" reaction.reaction_id = "test1" - dataset1 = dataset_pb2.Dataset(reactions=[reaction]) + dataset1 = dataset_pb2.Dataset(name="test1", description="test1", reactions=[reaction]) dataset1_filename = os.path.join(test_subdirectory, "test1.pbtxt") message_helpers.write_message(dataset1, dataset1_filename) 
reaction.provenance.record_created.time.value = "2020-01-03" reaction.provenance.record_created.person.username = "test3" reaction.provenance.record_created.person.email = "test3@example.com" reaction.reaction_id = "test2" - dataset2 = dataset_pb2.Dataset(reactions=[reaction]) + dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction]) dataset2_filename = os.path.join(test_subdirectory, "test2.pbtxt") message_helpers.write_message(dataset2, dataset2_filename) added, removed, changed, filenames = self._run(test_subdirectory) @@ -284,7 +286,7 @@ def test_add_dataset_with_existing_reaction_ids(self, setup): reaction.provenance.record_created.time.value = "2020-01-01" reaction.provenance.record_created.person.username = "test" reaction.provenance.record_created.person.email = "test@example.com" - dataset = dataset_pb2.Dataset(reactions=[reaction]) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction]) this_dataset_filename = os.path.join(test_subdirectory, "test.pbtxt") message_helpers.write_message(dataset, this_dataset_filename) added, removed, changed, filenames = self._run(test_subdirectory) @@ -354,7 +356,7 @@ def test_add_dataset_with_validation_errors(self, setup): component.amount.moles.value = 2 component.amount.moles.units = reaction_pb2.Moles.MILLIMOLE reaction.outcomes.add().conversion.value = 25 - dataset = dataset_pb2.Dataset(reactions=[reaction]) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction]) dataset_filename = os.path.join(test_subdirectory, "test.pbtxt") message_helpers.write_message(dataset, dataset_filename) with pytest.raises(validations.ValidationError, match="could not validate SMILES"): @@ -373,11 +375,11 @@ def test_add_sharded_dataset_with_validation_errors(self, setup): reaction.provenance.record_created.time.value = "2021-02-09" reaction.provenance.record_created.person.username = "bob" reaction.provenance.record_created.person.email = 
"bob@bob.com" - dataset1 = dataset_pb2.Dataset(reactions=[reaction]) + dataset1 = dataset_pb2.Dataset(name="test1", description="test1", reactions=[reaction]) dataset1_filename = os.path.join(test_subdirectory, "test1.pbtxt") message_helpers.write_message(dataset1, dataset1_filename) reaction.inputs["ethylamine"].components[0].identifiers[0].value = "#" - dataset2 = dataset_pb2.Dataset(reactions=[reaction]) + dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction]) dataset2_filename = os.path.join(test_subdirectory, "test2.pbtxt") message_helpers.write_message(dataset2, dataset2_filename) with pytest.raises(validations.ValidationError, match="could not validate SMILES"): @@ -408,7 +410,7 @@ def test_add_dataset_with_too_large_reaction(self, setup): reaction.provenance.record_created.time.value = "2023-07-01" reaction.provenance.record_created.person.name = "test" reaction.provenance.record_created.person.email = "test@example.com" - dataset = dataset_pb2.Dataset(reactions=[reaction]) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction]) dataset_filename = os.path.join(test_subdirectory, "test.pbtxt") message_helpers.write_message(dataset, dataset_filename) with pytest.raises(ValueError, match="larger than --max_size"): diff --git a/ord_schema/scripts/validate_dataset_test.py b/ord_schema/scripts/validate_dataset_test.py index 16231d36..18768fbb 100644 --- a/ord_schema/scripts/validate_dataset_test.py +++ b/ord_schema/scripts/validate_dataset_test.py @@ -38,12 +38,12 @@ def setup(tmp_path) -> str: reaction1.provenance.record_created.time.value = "2023-07-01" reaction1.provenance.record_created.person.name = "test" reaction1.provenance.record_created.person.email = "test@example.com" - dataset1 = dataset_pb2.Dataset(reactions=[reaction1]) + dataset1 = dataset_pb2.Dataset(name="test1", description="test1", reactions=[reaction1]) dataset1_filename = os.path.join(test_subdirectory, "dataset1.pbtxt")
message_helpers.write_message(dataset1, dataset1_filename) # reaction2 is empty. reaction2 = reaction_pb2.Reaction() - dataset2 = dataset_pb2.Dataset(reactions=[reaction1, reaction2]) + dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction1, reaction2]) dataset2_filename = os.path.join(test_subdirectory, "dataset2.pbtxt") message_helpers.write_message(dataset2, dataset2_filename) yield test_subdirectory diff --git a/ord_schema/templating.py b/ord_schema/templating.py index 77c27c9e..4a6e6c26 100644 --- a/ord_schema/templating.py +++ b/ord_schema/templating.py @@ -113,10 +113,14 @@ def _fill_template(string: str, substitutions: Mapping[str, ord_schema.ScalarTyp return reaction -def generate_dataset(template_string: str, df: pd.DataFrame, validate: bool = True) -> dataset_pb2.Dataset: +def generate_dataset( + name: str, description: str, template_string: str, df: pd.DataFrame, validate: bool = True +) -> dataset_pb2.Dataset: """Generates a Dataset by enumerating a template reaction. Args: + name: Dataset name. + description: Dataset description. template_string: The contents of a Reaction pbtxt where placeholder values to be replaced are defined between dollar signs. For example, a SMILES identifier value could be "$product_smiles$". 
PLaceholders @@ -156,4 +160,4 @@ def generate_dataset(template_string: str, df: pd.DataFrame, validate: bool = Tr raise ValueError(f"Enumerated Reaction is not valid: {output.errors}") reactions.append(reaction) - return dataset_pb2.Dataset(reactions=reactions) + return dataset_pb2.Dataset(name=name, description=description, reactions=reactions) diff --git a/ord_schema/templating_test.py b/ord_schema/templating_test.py index 1a63d463..d293cefb 100644 --- a/ord_schema/templating_test.py +++ b/ord_schema/templating_test.py @@ -45,7 +45,7 @@ def test_valid_templating(valid_reaction): template_string = template_string.replace('value: "CCO"', 'value: "$smiles$"') template_string = template_string.replace("value: 75", "value: $conversion$") df = pd.DataFrame.from_dict({"$smiles$": ["CCO", "CCCO", "CCCCO"], "$conversion$": [75, 50, 30]}) - dataset = templating.generate_dataset(template_string, df) + dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) expected_reactions = [] for smiles, conversion in zip(["CCO", "CCCO", "CCCCO"], [75, 50, 30]): reaction = reaction_pb2.Reaction() @@ -53,12 +53,12 @@ def test_valid_templating(valid_reaction): reaction.inputs["in"].components[0].identifiers[0].value = smiles reaction.outcomes[0].conversion.value = conversion expected_reactions.append(reaction) - expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions) + expected_dataset = dataset_pb2.Dataset(name="test", description="test", reactions=expected_reactions) assert dataset == expected_dataset # Test without "$" in column names df = pd.DataFrame.from_dict({"smiles": ["CCO", "CCCO", "CCCCO"], "conversion": [75, 50, 30]}) - dataset = templating.generate_dataset(template_string, df) + dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) assert dataset == expected_dataset @@ -69,14 +69,14 @@ def test_valid_templating_escapes(valid_reaction): 
valid_reaction.inputs["in"].components[0].identifiers.add(type="MOLBLOCK", value="$molblock$") template_string = text_format.MessageToString(valid_reaction) df = pd.DataFrame.from_dict({"molblock": molblocks}) - dataset = templating.generate_dataset(template_string, df) + dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) expected_reactions = [] for molblock in molblocks: reaction = reaction_pb2.Reaction() reaction.CopyFrom(valid_reaction) reaction.inputs["in"].components[0].identifiers[1].value = molblock expected_reactions.append(reaction) - expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions) + expected_dataset = dataset_pb2.Dataset(name="test", description="test", reactions=expected_reactions) assert dataset == expected_dataset @@ -103,10 +103,12 @@ def test_invalid_templating(valid_reaction): reaction.inputs["in"].components[0].identifiers[0].value = smiles reaction.outcomes[0].conversion.precision = precision expected_reactions.append(reaction) - expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions) + expected_dataset = dataset_pb2.Dataset(name="test", description="test", reactions=expected_reactions) with pytest.raises(ValueError, match="Enumerated Reaction is not valid"): - templating.generate_dataset(template_string, df) - dataset = templating.generate_dataset(template_string, df, validate=False) + templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) + dataset = templating.generate_dataset( + name="test", description="test", template_string=template_string, df=df, validate=False + ) assert dataset == expected_dataset @@ -116,7 +118,7 @@ def test_bad_placeholders(valid_reaction): template_string = template_string.replace("value: 75", "value: $conversion$") df = pd.DataFrame.from_dict({"$my_smiles$": ["CCO", "CCCO", "CCCCO"]}) with pytest.raises(ValueError, match=r"\$conversion\$ not found"): - 
templating.generate_dataset(template_string, df) + templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) def test_missing_values(tmp_path): @@ -153,8 +155,8 @@ def test_missing_values(tmp_path): f.write("CN,\n") # Missing mass. f.write(",1.5\n") # Missing SMILES. df = pd.read_csv(filename) - dataset = templating.generate_dataset(template_string, df) - expected_dataset = dataset_pb2.Dataset() + dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) + expected_dataset = dataset_pb2.Dataset(name="test", description="test") reaction1 = expected_dataset.reactions.add() reaction1.CopyFrom(reaction) reaction1.inputs["one"].components[0].identifiers[0].value = "CN" diff --git a/ord_schema/validations.py b/ord_schema/validations.py index b812cabc..87e7783a 100644 --- a/ord_schema/validations.py +++ b/ord_schema/validations.py @@ -380,6 +380,10 @@ def validate_dataset(message: dataset_pb2.Dataset, options: Optional[ValidationO # pylint: disable=too-many-branches,too-many-nested-blocks if options is None: options = ValidationOptions() + if not message.name: + warnings.warn("Dataset name is required", ValidationError) + if not message.description: + warnings.warn("Dataset description is required", ValidationError) if not message.reactions and not message.reaction_ids: warnings.warn("Dataset requires reactions or reaction_ids", ValidationError) elif message.reactions and message.reaction_ids: diff --git a/ord_schema/validations_test.py b/ord_schema/validations_test.py index 73ce7717..e1033a24 100644 --- a/ord_schema/validations_test.py +++ b/ord_schema/validations_test.py @@ -495,8 +495,22 @@ def test_data(): assert len(output.warnings) == 0 +def test_dataset_missing_name(): + message = dataset_pb2.Dataset() + options = validations.ValidationOptions(validate_ids=True) + with pytest.raises(validations.ValidationError, match="name is required"): + _run_validation(message, 
options=options) + + +def test_dataset_missing_description(): + message = dataset_pb2.Dataset(name="test") + options = validations.ValidationOptions(validate_ids=True) + with pytest.raises(validations.ValidationError, match="description is required"): + _run_validation(message, options=options) + + def test_dataset_bad_reaction_id(): - message = dataset_pb2.Dataset(reaction_ids=["foo"]) + message = dataset_pb2.Dataset(name="test", description="test", reaction_ids=["foo"]) options = validations.ValidationOptions(validate_ids=True) with pytest.raises(validations.ValidationError, match="malformed"): _run_validation(message, options=options) @@ -504,6 +518,8 @@ def test_dataset_bad_reaction_id(): def test_dataset_records_and_ids(): message = dataset_pb2.Dataset( + name="test", + description="test", reactions=[reaction_pb2.Reaction()], reaction_ids=["ord-c0bbd41f095a44a78b6221135961d809"], ) @@ -513,7 +529,9 @@ def test_dataset_records_and_ids(): def test_dataset_bad_id(): - message = dataset_pb2.Dataset(reactions=[reaction_pb2.Reaction()], dataset_id="foo") + message = dataset_pb2.Dataset( + name="test", description="test", reactions=[reaction_pb2.Reaction()], dataset_id="foo" + ) options = validations.ValidationOptions(validate_ids=True) with pytest.raises(validations.ValidationError, match="malformed"): _run_validation(message, recurse=False, options=options) @@ -537,8 +555,8 @@ def test_dataset_bad_id(): def test_dataset_example(): assert len(output.warnings) == 0 -def test_dataset_crossreferences(): - message = dataset_pb2.Dataset() +def test_dataset_cross_references(): + message = dataset_pb2.Dataset(name="test", description="test") reaction1 = message.reactions.add() reaction2 = message.reactions.add() reaction3 = message.reactions.add() From 3dfa8ac49d3d6bf51755de9e05b1b7e7b830fb6d Mon Sep 17 00:00:00 2001 From: Steven Kearnes Date: Fri, 12 Jul 2024 10:44:14 -0400 Subject: [PATCH 2/2] lint --- ord_schema/scripts/enumerate_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ord_schema/scripts/enumerate_dataset.py b/ord_schema/scripts/enumerate_dataset.py index 2835fc1a..39b1b414 100644 --- a/ord_schema/scripts/enumerate_dataset.py +++ b/ord_schema/scripts/enumerate_dataset.py @@ -14,7 +14,7 @@ """Creates a Dataset by enumerating a template with a spreadsheet. Usage: - dataset_templating.py --name= --description= --template= --spreadsheet= --output= [--no-validate] + dataset_templating.py --name= --description= --template= --spreadsheet= --output= [options] Options: --name= Dataset name