From c5a2055d60354d8e4cbdeb9ff8a947b4b5b0da04 Mon Sep 17 00:00:00 2001 From: Steven Kearnes Date: Fri, 12 Jul 2024 10:36:54 -0400 Subject: [PATCH 1/2] Make dataset name and description required --- ord_schema/scripts/enumerate_dataset.py | 12 +++++++-- ord_schema/scripts/enumerate_dataset_test.py | 6 ++++- ord_schema/scripts/process_dataset_test.py | 22 +++++++++-------- ord_schema/scripts/validate_dataset_test.py | 4 +-- ord_schema/templating.py | 8 ++++-- ord_schema/templating_test.py | 24 +++++++++--------- ord_schema/validations.py | 4 +++ ord_schema/validations_test.py | 26 +++++++++++++++++--- 8 files changed, 74 insertions(+), 32 deletions(-) diff --git a/ord_schema/scripts/enumerate_dataset.py b/ord_schema/scripts/enumerate_dataset.py index 71c993c4..2835fc1a 100644 --- a/ord_schema/scripts/enumerate_dataset.py +++ b/ord_schema/scripts/enumerate_dataset.py @@ -14,9 +14,11 @@ """Creates a Dataset by enumerating a template with a spreadsheet. Usage: - dataset_templating.py --template= --spreadsheet= --output= [--no-validate] + dataset_templating.py --name= --description= --template= --spreadsheet= --output= [--no-validate] Options: + --name= Dataset name + --description= Dataset description --template= Path to a Reaction pbtxt file defining a template --spreadsheet= Path to a spreadsheet file with a header row matching template placeholders --output= Filename for output Dataset @@ -39,7 +41,13 @@ def main(kwargs): kwargs["--template"], kwargs["--spreadsheet"], ) - dataset = templating.generate_dataset(template_string, df, validate=not kwargs["--no-validate"]) + dataset = templating.generate_dataset( + name=kwargs["--name"], + description=kwargs["--description"], + template_string=template_string, + df=df, + validate=not kwargs["--no-validate"], + ) logger.info("writing new Dataset to %s", kwargs["--output"]) message_helpers.write_message(dataset, kwargs["--output"]) diff --git a/ord_schema/scripts/enumerate_dataset_test.py 
b/ord_schema/scripts/enumerate_dataset_test.py index d52c5517..276a150b 100644 --- a/ord_schema/scripts/enumerate_dataset_test.py +++ b/ord_schema/scripts/enumerate_dataset_test.py @@ -91,7 +91,7 @@ def setup(tmp_path) -> tuple[str, str, str]: ) spreadsheet_filename = os.path.join(dirname, "spreadsheet.csv") data.to_csv(spreadsheet_filename, index=False) - expected = dataset_pb2.Dataset() + expected = dataset_pb2.Dataset(name="test", description="test") reaction1 = expected.reactions.add() reaction1_compound1 = reaction1.inputs["test"].components.add() reaction1_compound1.identifiers.add(value="C", type="SMILES") @@ -132,6 +132,10 @@ def test_main(setup, tmp_path): template_filename, spreadsheet_filename, expected = setup output_filename = (tmp_path / "dataset.pbtxt").as_posix() argv = [ + "--name", + "test", + "--description", + "test", "--template", template_filename, "--spreadsheet", diff --git a/ord_schema/scripts/process_dataset_test.py b/ord_schema/scripts/process_dataset_test.py index 9eae1d70..75d5efdb 100644 --- a/ord_schema/scripts/process_dataset_test.py +++ b/ord_schema/scripts/process_dataset_test.py @@ -48,6 +48,8 @@ def setup(self, tmp_path) -> tuple[str, str]: reaction1.provenance.record_created.person.username = "test" reaction1.provenance.record_created.person.email = "test@example.com" dataset1 = dataset_pb2.Dataset( + name="test1", + description="test1", dataset_id="ord_dataset-00000000000000000000000000000000", reactions=[reaction1], ) @@ -55,7 +57,7 @@ def setup(self, tmp_path) -> tuple[str, str]: message_helpers.write_message(dataset1, dataset1_filename) # reaction2 is empty. 
reaction2 = reaction_pb2.Reaction() - dataset2 = dataset_pb2.Dataset(reactions=[reaction1, reaction2]) + dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction1, reaction2]) dataset2_filename = (tmp_path / "dataset2.pb").as_posix() message_helpers.write_message(dataset2, dataset2_filename) yield dataset1_filename, dataset2_filename @@ -140,7 +142,7 @@ def setup(self, tmp_path) -> tuple[str, str]: reaction.provenance.record_created.person.email = "test@example.com" reaction.reaction_id = "ord-10aed8b5dffe41fab09f5b2cc9c58ad9" dataset_id = "ord_dataset-64b14868c5cd46dd8e75560fd3589a6b" - dataset = dataset_pb2.Dataset(reactions=[reaction], dataset_id=dataset_id) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction], dataset_id=dataset_id) # Make sure the initial dataset is valid. validations.validate_message(dataset) os.makedirs(os.path.join("data", "64")) @@ -213,7 +215,7 @@ def test_add_dataset(self, setup): reaction.provenance.record_created.person.username = "test" reaction.provenance.record_created.person.email = "test@example.com" reaction.reaction_id = "test" - dataset = dataset_pb2.Dataset(reactions=[reaction]) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction]) this_dataset_filename = os.path.join(test_subdirectory, "test.pbtxt") message_helpers.write_message(dataset, this_dataset_filename) added, removed, changed, filenames = self._run(test_subdirectory) @@ -246,14 +248,14 @@ def test_add_sharded_dataset(self, setup): reaction.provenance.record_created.person.username = "test2" reaction.provenance.record_created.person.email = "test2@example.com" reaction.reaction_id = "test1" - dataset1 = dataset_pb2.Dataset(reactions=[reaction]) + dataset1 = dataset_pb2.Dataset(name="test1", description="test1", reactions=[reaction]) dataset1_filename = os.path.join(test_subdirectory, "test1.pbtxt") message_helpers.write_message(dataset1, dataset1_filename) 
reaction.provenance.record_created.time.value = "2020-01-03" reaction.provenance.record_created.person.username = "test3" reaction.provenance.record_created.person.email = "test3@example.com" reaction.reaction_id = "test2" - dataset2 = dataset_pb2.Dataset(reactions=[reaction]) + dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction]) dataset2_filename = os.path.join(test_subdirectory, "test2.pbtxt") message_helpers.write_message(dataset2, dataset2_filename) added, removed, changed, filenames = self._run(test_subdirectory) @@ -284,7 +286,7 @@ def test_add_dataset_with_existing_reaction_ids(self, setup): reaction.provenance.record_created.time.value = "2020-01-01" reaction.provenance.record_created.person.username = "test" reaction.provenance.record_created.person.email = "test@example.com" - dataset = dataset_pb2.Dataset(reactions=[reaction]) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction]) this_dataset_filename = os.path.join(test_subdirectory, "test.pbtxt") message_helpers.write_message(dataset, this_dataset_filename) added, removed, changed, filenames = self._run(test_subdirectory) @@ -354,7 +356,7 @@ def test_add_dataset_with_validation_errors(self, setup): component.amount.moles.value = 2 component.amount.moles.units = reaction_pb2.Moles.MILLIMOLE reaction.outcomes.add().conversion.value = 25 - dataset = dataset_pb2.Dataset(reactions=[reaction]) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction]) dataset_filename = os.path.join(test_subdirectory, "test.pbtxt") message_helpers.write_message(dataset, dataset_filename) with pytest.raises(validations.ValidationError, match="could not validate SMILES"): @@ -373,11 +375,11 @@ def test_add_sharded_dataset_with_validation_errors(self, setup): reaction.provenance.record_created.time.value = "2021-02-09" reaction.provenance.record_created.person.username = "bob" reaction.provenance.record_created.person.email = 
"bob@bob.com" - dataset1 = dataset_pb2.Dataset(reactions=[reaction]) + dataset1 = dataset_pb2.Dataset(name="test1", description="test1", reactions=[reaction]) dataset1_filename = os.path.join(test_subdirectory, "test1.pbtxt") message_helpers.write_message(dataset1, dataset1_filename) reaction.inputs["ethylamine"].components[0].identifiers[0].value = "#" - dataset2 = dataset_pb2.Dataset(reactions=[reaction]) + dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction]) dataset2_filename = os.path.join(test_subdirectory, "test2.pbtxt") message_helpers.write_message(dataset2, dataset2_filename) with pytest.raises(validations.ValidationError, match="could not validate SMILES"): @@ -408,7 +410,7 @@ def test_add_dataset_with_too_large_reaction(self, setup): reaction.provenance.record_created.time.value = "2023-07-01" reaction.provenance.record_created.person.name = "test" reaction.provenance.record_created.person.email = "test@example.com" - dataset = dataset_pb2.Dataset(reactions=[reaction]) + dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction]) dataset_filename = os.path.join(test_subdirectory, "test.pbtxt") message_helpers.write_message(dataset, dataset_filename) with pytest.raises(ValueError, match="larger than --max_size"): diff --git a/ord_schema/scripts/validate_dataset_test.py b/ord_schema/scripts/validate_dataset_test.py index 16231d36..18768fbb 100644 --- a/ord_schema/scripts/validate_dataset_test.py +++ b/ord_schema/scripts/validate_dataset_test.py @@ -38,12 +38,12 @@ def setup(tmp_path) -> str: reaction1.provenance.record_created.time.value = "2023-07-01" reaction1.provenance.record_created.person.name = "test" reaction1.provenance.record_created.person.email = "test@example.com" - dataset1 = dataset_pb2.Dataset(reactions=[reaction1]) + dataset1 = dataset_pb2.Dataset(name="test1", description="test1", reactions=[reaction1]) dataset1_filename = os.path.join(test_subdirectory, "dataset1.pbtxt")
message_helpers.write_message(dataset1, dataset1_filename) # reaction2 is empty. reaction2 = reaction_pb2.Reaction() - dataset2 = dataset_pb2.Dataset(reactions=[reaction1, reaction2]) + dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction1, reaction2]) dataset2_filename = os.path.join(test_subdirectory, "dataset2.pbtxt") message_helpers.write_message(dataset2, dataset2_filename) yield test_subdirectory diff --git a/ord_schema/templating.py b/ord_schema/templating.py index 77c27c9e..4a6e6c26 100644 --- a/ord_schema/templating.py +++ b/ord_schema/templating.py @@ -113,10 +113,14 @@ def _fill_template(string: str, substitutions: Mapping[str, ord_schema.ScalarTyp return reaction -def generate_dataset(template_string: str, df: pd.DataFrame, validate: bool = True) -> dataset_pb2.Dataset: +def generate_dataset( + name: str, description: str, template_string: str, df: pd.DataFrame, validate: bool = True +) -> dataset_pb2.Dataset: """Generates a Dataset by enumerating a template reaction. Args: + name: Dataset name. + description: Dataset description. template_string: The contents of a Reaction pbtxt where placeholder values to be replaced are defined between dollar signs. For example, a SMILES identifier value could be "$product_smiles$". 
PLaceholders @@ -156,4 +160,4 @@ def generate_dataset(template_string: str, df: pd.DataFrame, validate: bool = Tr raise ValueError(f"Enumerated Reaction is not valid: {output.errors}") reactions.append(reaction) - return dataset_pb2.Dataset(reactions=reactions) + return dataset_pb2.Dataset(name=name, description=description, reactions=reactions) diff --git a/ord_schema/templating_test.py b/ord_schema/templating_test.py index 1a63d463..d293cefb 100644 --- a/ord_schema/templating_test.py +++ b/ord_schema/templating_test.py @@ -45,7 +45,7 @@ def test_valid_templating(valid_reaction): template_string = template_string.replace('value: "CCO"', 'value: "$smiles$"') template_string = template_string.replace("value: 75", "value: $conversion$") df = pd.DataFrame.from_dict({"$smiles$": ["CCO", "CCCO", "CCCCO"], "$conversion$": [75, 50, 30]}) - dataset = templating.generate_dataset(template_string, df) + dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) expected_reactions = [] for smiles, conversion in zip(["CCO", "CCCO", "CCCCO"], [75, 50, 30]): reaction = reaction_pb2.Reaction() @@ -53,12 +53,12 @@ def test_valid_templating(valid_reaction): reaction.inputs["in"].components[0].identifiers[0].value = smiles reaction.outcomes[0].conversion.value = conversion expected_reactions.append(reaction) - expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions) + expected_dataset = dataset_pb2.Dataset(name="test", description="test", reactions=expected_reactions) assert dataset == expected_dataset # Test without "$" in column names df = pd.DataFrame.from_dict({"smiles": ["CCO", "CCCO", "CCCCO"], "conversion": [75, 50, 30]}) - dataset = templating.generate_dataset(template_string, df) + dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) assert dataset == expected_dataset @@ -69,14 +69,14 @@ def test_valid_templating_escapes(valid_reaction): 
valid_reaction.inputs["in"].components[0].identifiers.add(type="MOLBLOCK", value="$molblock$") template_string = text_format.MessageToString(valid_reaction) df = pd.DataFrame.from_dict({"molblock": molblocks}) - dataset = templating.generate_dataset(template_string, df) + dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) expected_reactions = [] for molblock in molblocks: reaction = reaction_pb2.Reaction() reaction.CopyFrom(valid_reaction) reaction.inputs["in"].components[0].identifiers[1].value = molblock expected_reactions.append(reaction) - expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions) + expected_dataset = dataset_pb2.Dataset(name="test", description="test", reactions=expected_reactions) assert dataset == expected_dataset @@ -103,10 +103,12 @@ def test_invalid_templating(valid_reaction): reaction.inputs["in"].components[0].identifiers[0].value = smiles reaction.outcomes[0].conversion.precision = precision expected_reactions.append(reaction) - expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions) + expected_dataset = dataset_pb2.Dataset(name="test", description="test", reactions=expected_reactions) with pytest.raises(ValueError, match="Enumerated Reaction is not valid"): - templating.generate_dataset(template_string, df) - dataset = templating.generate_dataset(template_string, df, validate=False) + templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) + dataset = templating.generate_dataset( + name="test", description="test", template_string=template_string, df=df, validate=False + ) assert dataset == expected_dataset @@ -116,7 +118,7 @@ def test_bad_placeholders(valid_reaction): template_string = template_string.replace("value: 75", "value: $conversion$") df = pd.DataFrame.from_dict({"$my_smiles$": ["CCO", "CCCO", "CCCCO"]}) with pytest.raises(ValueError, match=r"\$conversion\$ not found"): - 
templating.generate_dataset(template_string, df) + templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) def test_missing_values(tmp_path): @@ -153,8 +155,8 @@ def test_missing_values(tmp_path): f.write("CN,\n") # Missing mass. f.write(",1.5\n") # Missing SMILES. df = pd.read_csv(filename) - dataset = templating.generate_dataset(template_string, df) - expected_dataset = dataset_pb2.Dataset() + dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df) + expected_dataset = dataset_pb2.Dataset(name="test", description="test") reaction1 = expected_dataset.reactions.add() reaction1.CopyFrom(reaction) reaction1.inputs["one"].components[0].identifiers[0].value = "CN" diff --git a/ord_schema/validations.py b/ord_schema/validations.py index b812cabc..87e7783a 100644 --- a/ord_schema/validations.py +++ b/ord_schema/validations.py @@ -380,6 +380,10 @@ def validate_dataset(message: dataset_pb2.Dataset, options: Optional[ValidationO # pylint: disable=too-many-branches,too-many-nested-blocks if options is None: options = ValidationOptions() + if not message.name: + warnings.warn("Dataset name is required", ValidationError) + if not message.description: + warnings.warn("Dataset description is required", ValidationError) if not message.reactions and not message.reaction_ids: warnings.warn("Dataset requires reactions or reaction_ids", ValidationError) elif message.reactions and message.reaction_ids: diff --git a/ord_schema/validations_test.py b/ord_schema/validations_test.py index 73ce7717..e1033a24 100644 --- a/ord_schema/validations_test.py +++ b/ord_schema/validations_test.py @@ -495,8 +495,22 @@ def test_data(): assert len(output.warnings) == 0 +def test_dataset_missing_name(): + message = dataset_pb2.Dataset() + options = validations.ValidationOptions(validate_ids=True) + with pytest.raises(validations.ValidationError, match="name is required"): + _run_validation(message, 
options=options) + + +def test_dataset_missing_description(): + message = dataset_pb2.Dataset(name="test") + options = validations.ValidationOptions(validate_ids=True) + with pytest.raises(validations.ValidationError, match="description is required"): + _run_validation(message, options=options) + + def test_dataset_bad_reaction_id(): - message = dataset_pb2.Dataset(reaction_ids=["foo"]) + message = dataset_pb2.Dataset(name="test", description="test", reaction_ids=["foo"]) options = validations.ValidationOptions(validate_ids=True) with pytest.raises(validations.ValidationError, match="malformed"): _run_validation(message, options=options) @@ -504,6 +518,8 @@ def test_dataset_bad_reaction_id(): def test_dataset_records_and_ids(): message = dataset_pb2.Dataset( + name="test", + description="test", reactions=[reaction_pb2.Reaction()], reaction_ids=["ord-c0bbd41f095a44a78b6221135961d809"], ) @@ -513,7 +529,9 @@ def test_dataset_records_and_ids(): def test_dataset_bad_id(): - message = dataset_pb2.Dataset(reactions=[reaction_pb2.Reaction()], dataset_id="foo") + message = dataset_pb2.Dataset( + name="test", description="test", reactions=[reaction_pb2.Reaction()], dataset_id="foo" + ) options = validations.ValidationOptions(validate_ids=True) with pytest.raises(validations.ValidationError, match="malformed"): _run_validation(message, recurse=False, options=options) @@ -537,8 +555,8 @@ def test_dataset_bad_id(): def test_dataset_example(): assert len(output.warnings) == 0 -def test_dataset_crossreferences(): - message = dataset_pb2.Dataset() +def test_dataset_cross_references(): + message = dataset_pb2.Dataset(name="test", description="test") reaction1 = message.reactions.add() reaction2 = message.reactions.add() reaction3 = message.reactions.add() From 3dfa8ac49d3d6bf51755de9e05b1b7e7b830fb6d Mon Sep 17 00:00:00 2001 From: Steven Kearnes Date: Fri, 12 Jul 2024 10:44:14 -0400 Subject: [PATCH 2/2] lint --- ord_schema/scripts/enumerate_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ord_schema/scripts/enumerate_dataset.py b/ord_schema/scripts/enumerate_dataset.py index 2835fc1a..39b1b414 100644 --- a/ord_schema/scripts/enumerate_dataset.py +++ b/ord_schema/scripts/enumerate_dataset.py @@ -14,7 +14,7 @@ """Creates a Dataset by enumerating a template with a spreadsheet. Usage: - dataset_templating.py --name= --description= --template= --spreadsheet= --output= [--no-validate] + dataset_templating.py --name= --description= --template= --spreadsheet= --output= [options] Options: --name= Dataset name