Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Require dataset name and description #737

Merged
merged 2 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions ord_schema/scripts/enumerate_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@
"""Creates a Dataset by enumerating a template with a spreadsheet.

Usage:
dataset_templating.py --template=<str> --spreadsheet=<str> --output=<str> [--no-validate]
dataset_templating.py --name=<str> --description=<str> --template=<str> --spreadsheet=<str> --output=<str> [options]

Options:
--name=<str> Dataset name
--description=<str> Dataset description
--template=<str> Path to a Reaction pbtxt file defining a template
--spreadsheet=<str> Path to a spreadsheet file with a header row matching template placeholders
--output=<str> Filename for output Dataset
Expand All @@ -39,7 +41,13 @@ def main(kwargs):
kwargs["--template"],
kwargs["--spreadsheet"],
)
dataset = templating.generate_dataset(template_string, df, validate=not kwargs["--no-validate"])
dataset = templating.generate_dataset(
name=kwargs["--name"],
description=kwargs["--description"],
template_string=template_string,
df=df,
validate=not kwargs["--no-validate"],
)
logger.info("writing new Dataset to %s", kwargs["--output"])
message_helpers.write_message(dataset, kwargs["--output"])

Expand Down
6 changes: 5 additions & 1 deletion ord_schema/scripts/enumerate_dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def setup(tmp_path) -> tuple[str, str, str]:
)
spreadsheet_filename = os.path.join(dirname, "spreadsheet.csv")
data.to_csv(spreadsheet_filename, index=False)
expected = dataset_pb2.Dataset()
expected = dataset_pb2.Dataset(name="test", description="test")
reaction1 = expected.reactions.add()
reaction1_compound1 = reaction1.inputs["test"].components.add()
reaction1_compound1.identifiers.add(value="C", type="SMILES")
Expand Down Expand Up @@ -132,6 +132,10 @@ def test_main(setup, tmp_path):
template_filename, spreadsheet_filename, expected = setup
output_filename = (tmp_path / "dataset.pbtxt").as_posix()
argv = [
"--name",
"test",
"--description",
"test",
"--template",
template_filename,
"--spreadsheet",
Expand Down
22 changes: 12 additions & 10 deletions ord_schema/scripts/process_dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,16 @@ def setup(self, tmp_path) -> tuple[str, str]:
reaction1.provenance.record_created.person.username = "test"
reaction1.provenance.record_created.person.email = "[email protected]"
dataset1 = dataset_pb2.Dataset(
name="test1",
description="test1",
dataset_id="ord_dataset-00000000000000000000000000000000",
reactions=[reaction1],
)
dataset1_filename = (tmp_path / "dataset1.pbtxt").as_posix()
message_helpers.write_message(dataset1, dataset1_filename)
# reaction2 is empty.
reaction2 = reaction_pb2.Reaction()
dataset2 = dataset_pb2.Dataset(reactions=[reaction1, reaction2])
dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction1, reaction2])
dataset2_filename = (tmp_path / "dataset2.pb").as_posix()
message_helpers.write_message(dataset2, dataset2_filename)
yield dataset1_filename, dataset2_filename
Expand Down Expand Up @@ -140,7 +142,7 @@ def setup(self, tmp_path) -> tuple[str, str]:
reaction.provenance.record_created.person.email = "[email protected]"
reaction.reaction_id = "ord-10aed8b5dffe41fab09f5b2cc9c58ad9"
dataset_id = "ord_dataset-64b14868c5cd46dd8e75560fd3589a6b"
dataset = dataset_pb2.Dataset(reactions=[reaction], dataset_id=dataset_id)
dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction], dataset_id=dataset_id)
# Make sure the initial dataset is valid.
validations.validate_message(dataset)
os.makedirs(os.path.join("data", "64"))
Expand Down Expand Up @@ -213,7 +215,7 @@ def test_add_dataset(self, setup):
reaction.provenance.record_created.person.username = "test"
reaction.provenance.record_created.person.email = "[email protected]"
reaction.reaction_id = "test"
dataset = dataset_pb2.Dataset(reactions=[reaction])
dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction])
this_dataset_filename = os.path.join(test_subdirectory, "test.pbtxt")
message_helpers.write_message(dataset, this_dataset_filename)
added, removed, changed, filenames = self._run(test_subdirectory)
Expand Down Expand Up @@ -246,14 +248,14 @@ def test_add_sharded_dataset(self, setup):
reaction.provenance.record_created.person.username = "test2"
reaction.provenance.record_created.person.email = "[email protected]"
reaction.reaction_id = "test1"
dataset1 = dataset_pb2.Dataset(reactions=[reaction])
dataset1 = dataset_pb2.Dataset(name="test1", description="test1", reactions=[reaction])
dataset1_filename = os.path.join(test_subdirectory, "test1.pbtxt")
message_helpers.write_message(dataset1, dataset1_filename)
reaction.provenance.record_created.time.value = "2020-01-03"
reaction.provenance.record_created.person.username = "test3"
reaction.provenance.record_created.person.email = "[email protected]"
reaction.reaction_id = "test2"
dataset2 = dataset_pb2.Dataset(reactions=[reaction])
dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction])
dataset2_filename = os.path.join(test_subdirectory, "test2.pbtxt")
message_helpers.write_message(dataset2, dataset2_filename)
added, removed, changed, filenames = self._run(test_subdirectory)
Expand Down Expand Up @@ -284,7 +286,7 @@ def test_add_dataset_with_existing_reaction_ids(self, setup):
reaction.provenance.record_created.time.value = "2020-01-01"
reaction.provenance.record_created.person.username = "test"
reaction.provenance.record_created.person.email = "[email protected]"
dataset = dataset_pb2.Dataset(reactions=[reaction])
dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction])
this_dataset_filename = os.path.join(test_subdirectory, "test.pbtxt")
message_helpers.write_message(dataset, this_dataset_filename)
added, removed, changed, filenames = self._run(test_subdirectory)
Expand Down Expand Up @@ -354,7 +356,7 @@ def test_add_dataset_with_validation_errors(self, setup):
component.amount.moles.value = 2
component.amount.moles.units = reaction_pb2.Moles.MILLIMOLE
reaction.outcomes.add().conversion.value = 25
dataset = dataset_pb2.Dataset(reactions=[reaction])
dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction])
dataset_filename = os.path.join(test_subdirectory, "test.pbtxt")
message_helpers.write_message(dataset, dataset_filename)
with pytest.raises(validations.ValidationError, match="could not validate SMILES"):
Expand All @@ -373,11 +375,11 @@ def test_add_sharded_dataset_with_validation_errors(self, setup):
reaction.provenance.record_created.time.value = "2021-02-09"
reaction.provenance.record_created.person.username = "bob"
reaction.provenance.record_created.person.email = "[email protected]"
dataset1 = dataset_pb2.Dataset(reactions=[reaction])
dataset1 = dataset_pb2.Dataset(name="test1", description="test2", reactions=[reaction])
dataset1_filename = os.path.join(test_subdirectory, "test1.pbtxt")
message_helpers.write_message(dataset1, dataset1_filename)
reaction.inputs["ethylamine"].components[0].identifiers[0].value = "#"
dataset2 = dataset_pb2.Dataset(reactions=[reaction])
dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction])
dataset2_filename = os.path.join(test_subdirectory, "test2.pbtxt")
message_helpers.write_message(dataset2, dataset2_filename)
with pytest.raises(validations.ValidationError, match="could not validate SMILES"):
Expand Down Expand Up @@ -408,7 +410,7 @@ def test_add_dataset_with_too_large_reaction(self, setup):
reaction.provenance.record_created.time.value = "2023-07-01"
reaction.provenance.record_created.person.name = "test"
reaction.provenance.record_created.person.email = "[email protected]"
dataset = dataset_pb2.Dataset(reactions=[reaction])
dataset = dataset_pb2.Dataset(name="test", description="test", reactions=[reaction])
dataset_filename = os.path.join(test_subdirectory, "test.pbtxt")
message_helpers.write_message(dataset, dataset_filename)
with pytest.raises(ValueError, match="larger than --max_size"):
Expand Down
4 changes: 2 additions & 2 deletions ord_schema/scripts/validate_dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ def setup(tmp_path) -> str:
reaction1.provenance.record_created.time.value = "2023-07-01"
reaction1.provenance.record_created.person.name = "test"
reaction1.provenance.record_created.person.email = "[email protected]"
dataset1 = dataset_pb2.Dataset(reactions=[reaction1])
dataset1 = dataset_pb2.Dataset(name="test1", description="test1", reactions=[reaction1])
dataset1_filename = os.path.join(test_subdirectory, "dataset1.pbtxt")
message_helpers.write_message(dataset1, dataset1_filename)
# reaction2 is empty.
reaction2 = reaction_pb2.Reaction()
dataset2 = dataset_pb2.Dataset(reactions=[reaction1, reaction2])
dataset2 = dataset_pb2.Dataset(name="test2", description="test2", reactions=[reaction1, reaction2])
dataset2_filename = os.path.join(test_subdirectory, "dataset2.pbtxt")
message_helpers.write_message(dataset2, dataset2_filename)
yield test_subdirectory
Expand Down
8 changes: 6 additions & 2 deletions ord_schema/templating.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,14 @@ def _fill_template(string: str, substitutions: Mapping[str, ord_schema.ScalarTyp
return reaction


def generate_dataset(template_string: str, df: pd.DataFrame, validate: bool = True) -> dataset_pb2.Dataset:
def generate_dataset(
name: str, description: str, template_string: str, df: pd.DataFrame, validate: bool = True
) -> dataset_pb2.Dataset:
"""Generates a Dataset by enumerating a template reaction.

Args:
name: Dataset name.
description: Dataset description.
template_string: The contents of a Reaction pbtxt where placeholder
values to be replaced are defined between dollar signs. For example,
a SMILES identifier value could be "$product_smiles$". Placeholders
Expand Down Expand Up @@ -156,4 +160,4 @@ def generate_dataset(template_string: str, df: pd.DataFrame, validate: bool = Tr
raise ValueError(f"Enumerated Reaction is not valid: {output.errors}")
reactions.append(reaction)

return dataset_pb2.Dataset(reactions=reactions)
return dataset_pb2.Dataset(name=name, description=description, reactions=reactions)
24 changes: 13 additions & 11 deletions ord_schema/templating_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,20 @@ def test_valid_templating(valid_reaction):
template_string = template_string.replace('value: "CCO"', 'value: "$smiles$"')
template_string = template_string.replace("value: 75", "value: $conversion$")
df = pd.DataFrame.from_dict({"$smiles$": ["CCO", "CCCO", "CCCCO"], "$conversion$": [75, 50, 30]})
dataset = templating.generate_dataset(template_string, df)
dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df)
expected_reactions = []
for smiles, conversion in zip(["CCO", "CCCO", "CCCCO"], [75, 50, 30]):
reaction = reaction_pb2.Reaction()
reaction.CopyFrom(valid_reaction)
reaction.inputs["in"].components[0].identifiers[0].value = smiles
reaction.outcomes[0].conversion.value = conversion
expected_reactions.append(reaction)
expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions)
expected_dataset = dataset_pb2.Dataset(name="test", description="test", reactions=expected_reactions)
assert dataset == expected_dataset

# Test without "$" in column names
df = pd.DataFrame.from_dict({"smiles": ["CCO", "CCCO", "CCCCO"], "conversion": [75, 50, 30]})
dataset = templating.generate_dataset(template_string, df)
dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df)
assert dataset == expected_dataset


Expand All @@ -69,14 +69,14 @@ def test_valid_templating_escapes(valid_reaction):
valid_reaction.inputs["in"].components[0].identifiers.add(type="MOLBLOCK", value="$molblock$")
template_string = text_format.MessageToString(valid_reaction)
df = pd.DataFrame.from_dict({"molblock": molblocks})
dataset = templating.generate_dataset(template_string, df)
dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df)
expected_reactions = []
for molblock in molblocks:
reaction = reaction_pb2.Reaction()
reaction.CopyFrom(valid_reaction)
reaction.inputs["in"].components[0].identifiers[1].value = molblock
expected_reactions.append(reaction)
expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions)
expected_dataset = dataset_pb2.Dataset(name="test", description="test", reactions=expected_reactions)
assert dataset == expected_dataset


Expand All @@ -103,10 +103,12 @@ def test_invalid_templating(valid_reaction):
reaction.inputs["in"].components[0].identifiers[0].value = smiles
reaction.outcomes[0].conversion.precision = precision
expected_reactions.append(reaction)
expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions)
expected_dataset = dataset_pb2.Dataset(name="test", description="test", reactions=expected_reactions)
with pytest.raises(ValueError, match="Enumerated Reaction is not valid"):
templating.generate_dataset(template_string, df)
dataset = templating.generate_dataset(template_string, df, validate=False)
templating.generate_dataset(name="test", description="test", template_string=template_string, df=df)
dataset = templating.generate_dataset(
name="test", description="test", template_string=template_string, df=df, validate=False
)
assert dataset == expected_dataset


Expand All @@ -116,7 +118,7 @@ def test_bad_placeholders(valid_reaction):
template_string = template_string.replace("value: 75", "value: $conversion$")
df = pd.DataFrame.from_dict({"$my_smiles$": ["CCO", "CCCO", "CCCCO"]})
with pytest.raises(ValueError, match=r"\$conversion\$ not found"):
templating.generate_dataset(template_string, df)
templating.generate_dataset(name="test", description="test", template_string=template_string, df=df)


def test_missing_values(tmp_path):
Expand Down Expand Up @@ -153,8 +155,8 @@ def test_missing_values(tmp_path):
f.write("CN,\n") # Missing mass.
f.write(",1.5\n") # Missing SMILES.
df = pd.read_csv(filename)
dataset = templating.generate_dataset(template_string, df)
expected_dataset = dataset_pb2.Dataset()
dataset = templating.generate_dataset(name="test", description="test", template_string=template_string, df=df)
expected_dataset = dataset_pb2.Dataset(name="test", description="test")
reaction1 = expected_dataset.reactions.add()
reaction1.CopyFrom(reaction)
reaction1.inputs["one"].components[0].identifiers[0].value = "CN"
Expand Down
4 changes: 4 additions & 0 deletions ord_schema/validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,10 @@ def validate_dataset(message: dataset_pb2.Dataset, options: Optional[ValidationO
# pylint: disable=too-many-branches,too-many-nested-blocks
if options is None:
options = ValidationOptions()
if not message.name:
warnings.warn("Dataset name is required", ValidationError)
if not message.description:
warnings.warn("Dataset description is required", ValidationError)
if not message.reactions and not message.reaction_ids:
warnings.warn("Dataset requires reactions or reaction_ids", ValidationError)
elif message.reactions and message.reaction_ids:
Expand Down
26 changes: 22 additions & 4 deletions ord_schema/validations_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,15 +495,31 @@ def test_data():
assert len(output.warnings) == 0


def test_dataset_missing_name():
    """A Dataset with no name must fail validation with a missing-name error."""
    options = validations.ValidationOptions(validate_ids=True)
    empty_dataset = dataset_pb2.Dataset()
    with pytest.raises(validations.ValidationError, match="name is required"):
        _run_validation(empty_dataset, options=options)


def test_dataset_missing_description():
    """A Dataset with a name but no description must fail validation.

    Note: this test was previously a verbatim duplicate of
    test_dataset_missing_name, so the second definition shadowed the first and
    the description check was never exercised. The name is set here so that
    the missing-description warning is the one that triggers the error.
    """
    message = dataset_pb2.Dataset(name="test")
    options = validations.ValidationOptions(validate_ids=True)
    with pytest.raises(validations.ValidationError, match="description is required"):
        _run_validation(message, options=options)


def test_dataset_bad_reaction_id():
    """A reaction_id that does not follow the ORD ID format is flagged as malformed."""
    options = validations.ValidationOptions(validate_ids=True)
    dataset = dataset_pb2.Dataset(name="test", description="test", reaction_ids=["foo"])
    with pytest.raises(validations.ValidationError, match="malformed"):
        _run_validation(dataset, options=options)


def test_dataset_records_and_ids():
message = dataset_pb2.Dataset(
name="test",
description="test",
reactions=[reaction_pb2.Reaction()],
reaction_ids=["ord-c0bbd41f095a44a78b6221135961d809"],
)
Expand All @@ -513,7 +529,9 @@ def test_dataset_records_and_ids():


def test_dataset_bad_id():
    """A dataset_id that does not follow the ORD dataset ID format is malformed."""
    options = validations.ValidationOptions(validate_ids=True)
    bad_dataset = dataset_pb2.Dataset(
        name="test",
        description="test",
        reactions=[reaction_pb2.Reaction()],
        dataset_id="foo",
    )
    with pytest.raises(validations.ValidationError, match="malformed"):
        _run_validation(bad_dataset, recurse=False, options=options)
Expand All @@ -537,8 +555,8 @@ def test_dataset_example():
assert len(output.warnings) == 0


def test_dataset_crossreferences():
message = dataset_pb2.Dataset()
def test_dataset_cross_references():
message = dataset_pb2.Dataset(name="test", description="test")
reaction1 = message.reactions.add()
reaction2 = message.reactions.add()
reaction3 = message.reactions.add()
Expand Down
Loading