diff --git a/api/tests/integration/ref/formats/helm_to_ket.py.out b/api/tests/integration/ref/formats/helm_to_ket.py.out index f9ffc07b3e..fc83880bf8 100644 --- a/api/tests/integration/ref/formats/helm_to_ket.py.out +++ b/api/tests/integration/ref/formats/helm_to_ket.py.out @@ -9,6 +9,7 @@ helm_multi_char_rna.ket:SUCCEED helm_peptide.ket:SUCCEED helm_rna_without_base.ket:SUCCEED helm_simple_rna.ket:SUCCEED +helm_smiles.ket:SUCCEED Test 'CHEM1{[A6OH]}|PEPTIDE1{A}$CHEM1,PEPTIDE1,1:R2-3:R1$$$V2.0': got expected error 'Polymer 'PEPTIDE1' does not contains monomer with number 3.' Test 'CHEM1{[A6OH]}|PEPTIDE1{A}$CHEM10,PEPTIDE1,1:R2-1:R1$$$V2.0': got expected error 'Polymer 'CHEM10' not found.' Test 'CHEM1{[MCC]}|RNA1{R(A)P.R(C)P.R(G)P.R(T)P.R(U)P}$RNA1,PEPTIDE1,15:R2-1:R1$$$V2.0': got expected error 'Polymer 'PEPTIDE1' not found.' diff --git a/api/tests/integration/ref/formats/ket_to_helm.py.out b/api/tests/integration/ref/formats/ket_to_helm.py.out index e873ec7931..17f90af745 100644 --- a/api/tests/integration/ref/formats/ket_to_helm.py.out +++ b/api/tests/integration/ref/formats/ket_to_helm.py.out @@ -8,6 +8,7 @@ helm_connetion_separator.ket:SUCCEED helm_cycled_polymer.ket:SUCCEED helm_mixed_base.ket:SUCCEED helm_mixed_custom.ket:SUCCEED +helm_monomer_molecule.ket:SUCCEED helm_multi_char_rna.ket:SUCCEED helm_peptide.ket:SUCCEED helm_rna_without_base.ket:SUCCEED diff --git a/api/tests/integration/tests/formats/helm_to_ket.py b/api/tests/integration/tests/formats/helm_to_ket.py index b484cc3938..21c2c3679d 100644 --- a/api/tests/integration/tests/formats/helm_to_ket.py +++ b/api/tests/integration/tests/formats/helm_to_ket.py @@ -39,6 +39,7 @@ def find_diff(a, b): "helm_mixed_base": "RNA1{[dR](A)P.[dR](A+G)P.[dR](A)P.[dR](G+C)}$$$$V2.0", "helm_mixed_custom": "RNA1{[dR](A:10+C:20+G:30+T:50)P.[dR](A:10+C:20+G:30+T:50)P.[dR](A+C+G+T)}$$$$V2.0", "aminoacids_variants": "PEPTIDE1{(D+N).(L+I).(E+Q).(A+C+D+E+F+G+H+I+K+L+M+N+O+P+Q+R+S+T+U+V+W+Y)}$$$$V2.0", + "helm_smiles": 
"PEPTIDE1{G.[[*]N[C@@H](C=O)C([*])=O |$_R1;;;;;;_R2;$|].C}|PEPTIDE2{G.[[*:1]N[C@@H](C=O)C([*:2])=O].C}$$$$", } lib = indigo.loadMonomerLibraryFromFile( diff --git a/api/tests/integration/tests/formats/ket_to_helm.py b/api/tests/integration/tests/formats/ket_to_helm.py index da1c8c5523..571feb86e9 100644 --- a/api/tests/integration/tests/formats/ket_to_helm.py +++ b/api/tests/integration/tests/formats/ket_to_helm.py @@ -47,6 +47,7 @@ def find_diff(a, b): "aminoacids_variants": "PEPTIDE1{(D+N).(L+I).(E+Q).(A+C+D+E+F+G+H+I+K+L+M+N+O+P+Q+R+S+T+U+V+W+Y)}$$$$V2.0", "dna_variants": "RNA1{[dR](C+G+T)P.[dR](A+C+G+T)}$$$$V2.0", "rna_variants": "RNA1{R(G+T)P.R(A+C+G+T)}$$$$V2.0", + "helm_monomer_molecule": "PEPTIDE1{A}|CHEM1{[C(N%91)=C%92.[*:1]%92.[*:2]%91 |$;;;_R1;_R2$|}$CHEM1,PEPTIDE1,1:R2-1:R1$$$V2.0", } for filename in sorted(helm_data.keys()): diff --git a/api/tests/integration/tests/formats/ref/helm_monomer_molecule.ket b/api/tests/integration/tests/formats/ref/helm_monomer_molecule.ket new file mode 100644 index 0000000000..5510e1f8fc --- /dev/null +++ b/api/tests/integration/tests/formats/ref/helm_monomer_molecule.ket @@ -0,0 +1,275 @@ +{ + "root": { + "nodes": [ + { + "$ref": "monomer2" + }, + { + "$ref": "mol0" + } + ], + "connections": [ + { + "connectionType": "single", + "endpoint1": { + "moleculeId": "mol0", + "atomId": "3" + }, + "endpoint2": { + "monomerId": "monomer2", + "attachmentPointId": "R1" + } + } + ], + "templates": [ + { + "$ref": "monomerTemplate-A___Alanine" + } + ] + }, + "mol0": { + "type": "molecule", + "atoms": [ + { + "label": "C", + "location": [ + 15.525000000000002, + -7.1499999999999995, + 0 + ] + }, + { + "label": "C", + "location": [ + 16.39102540378444, + -6.649999999999999, + 0 + ] + }, + { + "label": "H", + "location": [ + 17.25705080756888, + -7.150000000000001, + 0 + ] + }, + { + "label": "N", + "location": [ + 14.658974596215565, + -6.649999999999999, + 0 + ] + }, + { + "label": "H", + "location": [ + 13.792949192431125, + 
-7.1499999999999995, + 0 + ] + } + ], + "bonds": [ + { + "type": 2, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 4 + ] + } + ], + "sgroups": [ + { + "type": "SUP", + "atoms": [ + 1, + 0, + 2, + 3, + 4 + ], + "name": "", + "expanded": true, + "id": 0, + "attachmentPoints": [ + { + "attachmentAtom": 1, + "leavingAtom": 2, + "attachmentId": "1" + }, + { + "attachmentAtom": 3, + "leavingAtom": 4, + "attachmentId": "2" + } + ] + } + ] + }, + "monomer2": { + "type": "monomer", + "id": "2", + "position": { + "x": 17.88125, + "y": -6.8812500000000005 + }, + "alias": "A", + "templateId": "A___Alanine" + }, + "monomerTemplate-A___Alanine": { + "type": "monomerTemplate", + "atoms": [ + { + "label": "N", + "location": [ + -0.9805331061317907, + -0.3062945076130863, + 0 + ] + }, + { + "label": "C", + "location": [ + -0.21253088283357008, + 0.20573302003705513, + 0 + ], + "stereoLabel": "abs" + }, + { + "label": "C", + "location": [ + -0.24245710640903237, + 1.3590256048251048, + 0 + ] + }, + { + "label": "C", + "location": [ + 0.8222288529623742, + -0.3062945076130863, + 0 + ] + }, + { + "label": "O", + "location": [ + 0.8461385772811508, + -1.2284597573196283, + 0 + ] + }, + { + "label": "O", + "location": [ + 1.5903092126145777, + 0.20573302003705513, + 0 + ] + }, + { + "label": "H", + "location": [ + -1.8232336838376928, + 0.07071340035455181, + 0 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 1, + 0 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ], + "stereo": 1 + }, + { + "type": 1, + "atoms": [ + 1, + 3 + ] + }, + { + "type": 2, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 6 + ] + } + ], + "class": "AminoAcid", + "classHELM": "PEPTIDE", + "id": "A___Alanine", + "fullName": "Alanine", + "alias": "A", + "attachmentPoints": [ + { + "attachmentAtom": 0, + "leavingGroup": { + 
"atoms": [ + 6 + ] + }, + "type": "left" + }, + { + "attachmentAtom": 3, + "leavingGroup": { + "atoms": [ + 5 + ] + }, + "type": "right" + } + ], + "naturalAnalogShort": "A" + } +} \ No newline at end of file diff --git a/api/tests/integration/tests/formats/ref/helm_smiles.ket b/api/tests/integration/tests/formats/ref/helm_smiles.ket new file mode 100644 index 0000000000..20fe2239bd --- /dev/null +++ b/api/tests/integration/tests/formats/ref/helm_smiles.ket @@ -0,0 +1,724 @@ +{ + "root": { + "nodes": [ + { + "$ref": "monomer0" + }, + { + "$ref": "monomer1" + }, + { + "$ref": "monomer2" + }, + { + "$ref": "monomer3" + }, + { + "$ref": "monomer4" + }, + { + "$ref": "monomer5" + } + ], + "connections": [ + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer0", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer1", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer1", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer2", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer3", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer4", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer4", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer5", + "attachmentPointId": "R1" + } + } + ], + "templates": [ + { + "$ref": "monomerTemplate-G___Glycine" + }, + { + "$ref": "monomerTemplate-Mod0" + }, + { + "$ref": "monomerTemplate-C___Cysteine" + }, + { + "$ref": "monomerTemplate-Mod1" + } + ] + }, + "monomer0": { + "type": "monomer", + "id": "0", + "seqid": 1, + "position": { + "x": 0.000000, + "y": -0.000000 + }, + "alias": "G", + "templateId": "G___Glycine" + }, + "monomer1": { + "type": "monomer", + "id": "1", + "seqid": 2, + "position": { + "x": 1.600000, + "y": -0.000000 + }, + "alias": "Mod0", 
+ "templateId": "Mod0" + }, + "monomer2": { + "type": "monomer", + "id": "2", + "seqid": 3, + "position": { + "x": 3.200000, + "y": -0.000000 + }, + "alias": "C", + "templateId": "C___Cysteine" + }, + "monomer3": { + "type": "monomer", + "id": "3", + "seqid": 4, + "position": { + "x": 0.000000, + "y": -1.600000 + }, + "alias": "G", + "templateId": "G___Glycine" + }, + "monomer4": { + "type": "monomer", + "id": "4", + "seqid": 5, + "position": { + "x": 1.600000, + "y": -1.600000 + }, + "alias": "Mod1", + "templateId": "Mod1" + }, + "monomer5": { + "type": "monomer", + "id": "5", + "seqid": 6, + "position": { + "x": 3.200000, + "y": -1.600000 + }, + "alias": "C", + "templateId": "C___Cysteine" + }, + "monomerTemplate-G___Glycine": { + "type": "monomerTemplate", + "id": "G___Glycine", + "class": "AminoAcid", + "classHELM": "PEPTIDE", + "fullName": "Glycine", + "alias": "G", + "naturalAnalogShort": "G", + "attachmentPoints": [ + { + "attachmentAtom": 4, + "type": "left", + "leavingGroup": { + "atoms": [ + 5 + ] + } + }, + { + "attachmentAtom": 1, + "type": "right", + "leavingGroup": { + "atoms": [ + 3 + ] + } + } + ], + "atoms": [ + { + "label": "C", + "location": [ + -0.336300, + 0.534600, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.992900, + -0.110700, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 1.078200, + -1.289000, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 1.970900, + 0.552000, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -1.326000, + -0.110700, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -2.379700, + 0.423800, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 2, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 4, + 5 + ] + } + ] + }, + "monomerTemplate-Mod0": { + "type": "monomerTemplate", + "id": "Mod0", + "class": "AminoAcid", + 
"alias": "Mod0", + "attachmentPoints": [ + { + "attachmentAtom": 1, + "label": "R1", + "leavingGroup": { + "atoms": [ + 0 + ] + } + }, + { + "attachmentAtom": 5, + "label": "R2", + "leavingGroup": { + "atoms": [ + 6 + ] + } + } + ], + "atoms": [ + { + "label": "H", + "location": [ + -1.500000, + 0.866026, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -0.500000, + 0.866025, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.000000, + 0.000000, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.500000, + -0.866025, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + -1.500000, + -0.866025, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 1.000000, + 0.000000, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + 1.500000, + 0.866025, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 1.500000, + -0.866025, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 2, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 5, + 6 + ] + }, + { + "type": 2, + "atoms": [ + 5, + 7 + ] + } + ] + }, + "monomerTemplate-C___Cysteine": { + "type": "monomerTemplate", + "id": "C___Cysteine", + "class": "AminoAcid", + "classHELM": "PEPTIDE", + "fullName": "Cysteine", + "alias": "C", + "naturalAnalogShort": "C", + "attachmentPoints": [ + { + "attachmentAtom": 4, + "type": "left", + "leavingGroup": { + "atoms": [ + 7 + ] + } + }, + { + "attachmentAtom": 0, + "type": "right", + "leavingGroup": { + "atoms": [ + 6 + ] + } + }, + { + "attachmentAtom": 3, + "type": "side", + "leavingGroup": { + "atoms": [ + 8 + ] + } + } + ], + "atoms": [ + { + "label": "C", + "location": [ + 1.445700, + -1.133300, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.145300, + -0.384000, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "C", + 
"location": [ + 0.143000, + 1.116800, + 0.000000 + ] + }, + { + "label": "S", + "location": [ + -1.157300, + 1.866100, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -1.155100, + -1.133300, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 1.447500, + -2.333300, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 2.484200, + -0.532000, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -2.194200, + -0.533100, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -1.159100, + 3.066100, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 2, + "atoms": [ + 5, + 0 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 6 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ], + "stereo": 1 + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 4, + 7 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 8 + ] + } + ] + }, + "monomerTemplate-Mod1": { + "type": "monomerTemplate", + "id": "Mod1", + "class": "AminoAcid", + "alias": "Mod1", + "attachmentPoints": [ + { + "attachmentAtom": 1, + "label": "R1", + "leavingGroup": { + "atoms": [ + 0 + ] + } + }, + { + "attachmentAtom": 5, + "label": "R2", + "leavingGroup": { + "atoms": [ + 6 + ] + } + } + ], + "atoms": [ + { + "label": "H", + "location": [ + -1.500000, + 0.866026, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -0.500000, + 0.866025, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.000000, + 0.000000, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.500000, + -0.866025, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + -1.500000, + -0.866025, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 1.000000, + 0.000000, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + 1.500000, + 0.866025, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 1.500000, + -0.866025, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 1, + 
"atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 2, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 5, + 6 + ] + }, + { + "type": 2, + "atoms": [ + 5, + 7 + ] + } + ] + } +} \ No newline at end of file diff --git a/core/indigo-core/molecule/elements.h b/core/indigo-core/molecule/elements.h index 7a0771d7a3..895a18e97f 100644 --- a/core/indigo-core/molecule/elements.h +++ b/core/indigo-core/molecule/elements.h @@ -184,6 +184,7 @@ namespace indigo DECL_ERROR; static const char* toString(int element); + static const char* toString(int element, int isotope); static int fromString(const char* name); static int fromString2(const char* name); static int fromChar(char c); diff --git a/core/indigo-core/molecule/ket_objects.h b/core/indigo-core/molecule/ket_objects.h index dbbdc5c665..a66c9f0273 100644 --- a/core/indigo-core/molecule/ket_objects.h +++ b/core/indigo-core/molecule/ket_objects.h @@ -774,6 +774,8 @@ namespace indigo { groupId, monomerId, + moleculeId, + atomId, attachmentPointId }; }; diff --git a/core/indigo-core/molecule/monomers_template_library.h b/core/indigo-core/molecule/monomers_template_library.h index bd4fb3b3be..ffd7a07d53 100644 --- a/core/indigo-core/molecule/monomers_template_library.h +++ b/core/indigo-core/molecule/monomers_template_library.h @@ -132,6 +132,10 @@ namespace indigo _bonds = other._bonds; } + int AddAtom(const std::string& label, Vec3f location); + + int AddBond(int bond_type, int atom1, int atom2); + private: enum class StringProps { diff --git a/core/indigo-core/molecule/sequence_loader.h b/core/indigo-core/molecule/sequence_loader.h index f48ac0617e..1859f57048 100644 --- a/core/indigo-core/molecule/sequence_loader.h +++ b/core/indigo-core/molecule/sequence_loader.h @@ -82,7 +82,7 @@ namespace indigo void checkAddTemplate(BaseMolecule& mol, const MonomerTemplate& 
monomer_template); - const std::string& checkAddTemplate(KetDocument& document, MonomerClass monomer_class, const std::string alias); + const std::string& checkAddTemplate(KetDocument& document, MonomerClass monomer_class, const std::string& alias); void checkAddTemplate(KetDocument& document, const MonomerTemplate& monomer_template); void addAminoAcid(BaseMolecule& mol, char ch); @@ -115,8 +115,8 @@ namespace indigo size_t addKetMonomer(KetDocument& document, MonomerInfo info, MonomerClass monomer_class, const Vec3f& pos); int readCount(std::string& count, Scanner& _scanner); - MonomerInfo readHelmMonomer(MonomerClass monomer_class = MonomerClass::Unknown); - std::string readHelmMonomerAlias(); + MonomerInfo readHelmMonomer(KetDocument& document, MonomerClass monomer_class = MonomerClass::Unknown); + std::string readHelmMonomerAlias(KetDocument& document, MonomerClass monomer_class); std::string readHelmRepeating(); std::string readHelmAnnotation(); std::string readHelmSimplePolymerName(std::string& polymer_name); diff --git a/core/indigo-core/molecule/src/elements.cpp b/core/indigo-core/molecule/src/elements.cpp index 6e379e5c69..b5a8229c32 100644 --- a/core/indigo-core/molecule/src/elements.cpp +++ b/core/indigo-core/molecule/src/elements.cpp @@ -256,6 +256,18 @@ const char* Element::toString(int element) return _instance()._element_parameters.at(element).name; } +/* Isotope-aware element label: for ELEM_H returns "D" when isotope == DEUTERIUM and "T" when isotope == TRITIUM; every other element/isotope combination is delegated to toString(element). */ const char* Element::toString(int element, int isotope) +{ + if (element == ELEM_H) + { + if (isotope == DEUTERIUM) + return "D"; + if (isotope == TRITIUM) + return "T"; + } + return toString(element); +} + int Element::calcValenceOfAromaticAtom(int elem, int charge, int n_arom, int min_conn) { if (elem == ELEM_C) diff --git a/core/indigo-core/molecule/src/ket_document.cpp b/core/indigo-core/molecule/src/ket_document.cpp index 7c24d2eea6..cad1617652 100644 --- a/core/indigo-core/molecule/src/ket_document.cpp +++ b/core/indigo-core/molecule/src/ket_document.cpp @@ -390,39 +390,51 @@ void 
KetDocument::parseSimplePolymers(std::vector>& sequ { auto& ep1 = connection.ep1(); auto& ep2 = connection.ep2(); + bool has_mol_1 = ep1.hasStringProp("moleculeId"); bool has_mon_1 = ep1.hasStringProp("monomerId"); + bool has_mol_2 = ep2.hasStringProp("moleculeId"); bool has_mon_2 = ep2.hasStringProp("monomerId"); - if (has_mon_1 != has_mon_2) + if ((has_mon_1 || has_mol_1) != (has_mon_2 || has_mol_2)) throw Error("Connection with only one end point."); - if (!has_mon_1) + if (!(has_mon_1 || has_mol_1)) throw Error("Connection with empty point."); bool has_ap_1 = ep1.hasStringProp("attachmentPointId"); + bool has_atom_1 = ep1.hasStringProp("atomId"); bool has_ap_2 = ep2.hasStringProp("attachmentPointId"); - if (has_ap_1 != has_ap_2) + bool has_atom_2 = ep1.hasStringProp("atomId"); + if ((has_ap_1 || has_atom_1) != (has_ap_2 || has_atom_2)) throw Error("Connection with only one attachment point id."); - if (!has_ap_1) + if (!(has_ap_1 || has_atom_1)) throw Error("Connection with empty attachment point."); + if ((has_mon_1 != has_ap_1) || (has_mon_2 != has_ap_2)) + throw Error("Wrong connection point"); + auto& mon_ref_1 = has_mon_1 ? ep1.getStringProp("monomerId") : ep1.getStringProp("moleculeId"); + auto& mon_ref_2 = has_mon_2 ? ep2.getStringProp("monomerId") : ep2.getStringProp("moleculeId"); - auto& mon_ref_1 = ep1.getStringProp("monomerId"); - auto& mon_ref_2 = ep2.getStringProp("monomerId"); + auto& mon_id_1 = has_mon_1 ? _monomer_ref_to_id.at(mon_ref_1) : mon_ref_1; + auto& mon_id_2 = has_mon_2 ? 
_monomer_ref_to_id.at(mon_ref_2) : mon_ref_2; - auto& mon_id_1 = _monomer_ref_to_id.at(mon_ref_1); - auto& mon_id_2 = _monomer_ref_to_id.at(mon_ref_2); + // molecules saved in helm as CHEM + if (has_mol_1) + id_to_class.emplace(mon_ref_1, MonomerClass::CHEM); + if (has_mol_1) + id_to_class.emplace(mon_ref_2, MonomerClass::CHEM); auto& mon1_class = id_to_class.at(mon_id_1); auto& mon2_class = id_to_class.at(mon_id_2); - auto& ap_id_1 = ep1.getStringProp("attachmentPointId"); - auto& ap_id_2 = ep2.getStringProp("attachmentPointId"); + auto& ap_id_1 = has_mon_1 ? ep1.getStringProp("attachmentPointId") : ep1.getStringProp("atomId"); + auto& ap_id_2 = has_mon_1 ? ep2.getStringProp("attachmentPointId") : ep1.getStringProp("atomId"); ap_to_connection.emplace(std::make_pair(mon_id_1, ap_id_1), connection); ap_to_connection.emplace(std::make_pair(mon_id_2, ap_id_2), connection); bool sequence_connection = false; - if (for_idt) - sequence_connection = isIdtConnection(mon1_class, ap_id_1, mon2_class, ap_id_2); - else - sequence_connection = isSimplePolymerConnection(mon1_class, ap_id_1, mon2_class, ap_id_2); + if (has_mon_1 && has_mon_2) // any connection to molecule is not in sequence + if (for_idt) + sequence_connection = isIdtConnection(mon1_class, ap_id_1, mon2_class, ap_id_2); + else + sequence_connection = isSimplePolymerConnection(mon1_class, ap_id_1, mon2_class, ap_id_2); if (!sequence_connection) { _non_sequence_connections.emplace_back(connection); diff --git a/core/indigo-core/molecule/src/ket_objects.cpp b/core/indigo-core/molecule/src/ket_objects.cpp index 8853142e74..3848828075 100644 --- a/core/indigo-core/molecule/src/ket_objects.cpp +++ b/core/indigo-core/molecule/src/ket_objects.cpp @@ -609,6 +609,8 @@ const std::map& KetConnectionEndPoint::getStringPropStrToIdx() static std::map str_to_idx{ {"groupId", toUType(StringProps::groupId)}, {"monomerId", toUType(StringProps::monomerId)}, + {"moleculeId", toUType(StringProps::moleculeId)}, + {"atomId", 
toUType(StringProps::atomId)}, {"attachmentPointId", toUType(StringProps::attachmentPointId)}, }; return str_to_idx; diff --git a/core/indigo-core/molecule/src/molecule_json_saver.cpp b/core/indigo-core/molecule/src/molecule_json_saver.cpp index 3428d4614b..6ed85009d0 100644 --- a/core/indigo-core/molecule/src/molecule_json_saver.cpp +++ b/core/indigo-core/molecule/src/molecule_json_saver.cpp @@ -722,21 +722,8 @@ void MoleculeJsonSaver::saveAtoms(BaseMolecule& mol, JsonWriter& writer) } else if (anum != VALUE_UNKNOWN) { - buf.readString(Element::toString(anum), true); + buf.readString(Element::toString(anum, isotope), true); radical = mol.getAtomRadical(i); - if (anum == ELEM_H) - { - if (isotope == DEUTERIUM) - { - buf.clear(); - buf.appendString("D", true); - } - if (isotope == TRITIUM) - { - buf.clear(); - buf.appendString("T", true); - } - } } else if (_pqmol) { diff --git a/core/indigo-core/molecule/src/monomers_template_library.cpp b/core/indigo-core/molecule/src/monomers_template_library.cpp index 730d679673..422cbe9e76 100644 --- a/core/indigo-core/molecule/src/monomers_template_library.cpp +++ b/core/indigo-core/molecule/src/monomers_template_library.cpp @@ -53,6 +53,19 @@ namespace indigo return str_to_idx; }; + /* Appends a new atom constructed from `label` to _atoms, sets its location, and returns the index of the appended atom (size - 1). */ int MonomerTemplate::AddAtom(const std::string& label, Vec3f location) + { + _atoms.push_back(std::make_unique(label)); + (*_atoms.rbegin())->setLocation(location); + return static_cast(_atoms.size() - 1); + } + + /* Appends a bond (bond_type, atom1, atom2) to _bonds and returns the index of the appended bond (size - 1). NOTE(review): atom1/atom2 are presumably indices as returned by AddAtom; no range check is done here - confirm with callers. */ int MonomerTemplate::AddBond(int bond_type, int atom1, int atom2) + { + _bonds.emplace_back(bond_type, atom1, atom2); + return static_cast(_bonds.size() - 1); + } + KetAttachmentPoint& MonomerTemplate::AddAttachmentPoint(const std::string& label, int att_atom) { std::string ap_id = label.size() != 0 ? 
label : "R" + std::to_string(1 + _attachment_points.size()); diff --git a/core/indigo-core/molecule/src/sequence_loader.cpp b/core/indigo-core/molecule/src/sequence_loader.cpp index 9352e5263d..443a888c36 100644 --- a/core/indigo-core/molecule/src/sequence_loader.cpp +++ b/core/indigo-core/molecule/src/sequence_loader.cpp @@ -24,12 +24,14 @@ #include "base_cpp/scanner.h" #include "layout/molecule_layout.h" #include "layout/sequence_layout.h" +#include "molecule/elements.h" #include "molecule/ket_commons.h" #include "molecule/ket_document.h" #include "molecule/molecule.h" #include "molecule/monomer_commons.h" #include "molecule/monomers_template_library.h" #include "molecule/sequence_loader.h" +#include "molecule/smiles_loader.h" using namespace indigo; @@ -569,7 +571,7 @@ void SequenceLoader::checkAddTemplate(KetDocument& document, const MonomerTempla } } -const std::string& SequenceLoader::checkAddTemplate(KetDocument& document, MonomerClass monomer_class, const std::string alias) +const std::string& SequenceLoader::checkAddTemplate(KetDocument& document, MonomerClass monomer_class, const std::string& alias) { auto& id = _library.getMonomerTemplateIdByAlias(monomer_class, alias); if (_added_templates.count(std::make_pair(monomer_class, alias)) == 0) @@ -580,7 +582,9 @@ const std::string& SequenceLoader::checkAddTemplate(KetDocument& document, Monom MonomerTemplate::MonomerClassToStr(monomer_class).c_str(), alias.c_str()); document.addMonomerTemplate(_library.getMonomerTemplateById(id)); } - return id; + if (id.size() > 0) + return id; + return alias; } void SequenceLoader::check_monomer_place(std::string& idt_alias, IdtModification mon_mod, IdtModification alias_mod, bool has_prev_mon) @@ -1264,7 +1268,7 @@ static std::set polymer_types{kHELMPolymerTypePEPTIDE, kHELMPolymer static const char* reserved_helm_chars = "${}|.,-:[]()"; static const char* unexpected_eod = unexpected_eod; -std::string SequenceLoader::readHelmMonomerAlias() +std::string 
SequenceLoader::readHelmMonomerAlias(KetDocument& document, MonomerClass monomer_class) { std::string monomer_alias; auto ch = _scanner.lookNext(); @@ -1280,10 +1284,13 @@ std::string SequenceLoader::readHelmMonomerAlias() { case '[': bracket_count++; + monomer_alias += ch; smiles = true; break; case ']': bracket_count--; + if (bracket_count > 0) + monomer_alias += ch; break; default: monomer_alias += ch; @@ -1295,7 +1302,66 @@ std::string SequenceLoader::readHelmMonomerAlias() if (ch != ']') throw Error("Unexpected char. Expected ']' but found '%c'.", ch); if (smiles) - throw Error("Inline smiles not supported for now."); + { + // Convert smiles to molecule + BufferScanner scanner(monomer_alias.c_str()); + SmilesLoader loader(scanner); + Molecule mol{}; + loader.loadMolecule(mol); + MoleculeLayout ml(mol, false); + ml.layout_orientation = UNCPECIFIED; + ml.make(); + // create template based on molecule + monomer_alias = "Mod" + std::to_string(_unknown_variants_count++); + auto& mon_template = document.addMonomerTemplate(monomer_alias, MonomerTemplate::MonomerClassToStr(monomer_class), IdtAlias()); + mon_template.setStringProp("alias", monomer_alias); + std::map rgroups; + std::map rg_to_attatom; + std::vector bonds; + for (auto i : mol.vertices()) + { + if (mol.isRSite(i)) + { + const auto& vertex = mol.getVertex(i); + if (vertex.degree() != 1) + throw Error("Attachment point should be connected to single atom"); + rg_to_attatom.emplace(i, vertex.neiVertex(vertex.neiBegin())); + rgroups.emplace(i, mol.getSingleAllowedRGroup(i)); + mon_template.AddAtom("H", mol.getAtomXyz(i)); + } + else + { + int anum = mol.getAtomNumber(i); + std::string label; + if (anum == VALUE_UNKNOWN) + throw Error("Unknown element"); + int isotope = mol.getAtomIsotope(i); + mon_template.AddAtom(Element::toString(anum, isotope), mol.getAtomXyz(i)); + } + } + for (auto i : mol.edges()) + { + auto edge = mol.getEdge(i); + int bond_order = mol.getBondOrder(i); + if (bond_order == BOND_ZERO) + { 
+ bond_order = _BOND_COORDINATION; + const Edge& edge = mol.getEdge(i); + if ((mol.getAtomNumber(edge.beg) == ELEM_H) || (mol.getAtomNumber(edge.end) == ELEM_H)) + bond_order = _BOND_HYDROGEN; + } + mon_template.AddBond(bond_order, edge.beg, edge.end); + } + for (auto& it : rgroups) + { + std::string label = 'R' + std::to_string(it.second); + auto& att_point = mon_template.AddAttachmentPoint(label, rg_to_attatom.at(it.first)); + std::vector lg; + lg.emplace_back(it.first); + att_point.setLeavingGroup(lg); + } + _added_templates.emplace(monomer_class, monomer_alias); + } } else if (ch != -1) { @@ -1360,7 +1426,7 @@ int SequenceLoader::readCount(std::string& count, Scanner& _scanner) return ch; } -SequenceLoader::MonomerInfo SequenceLoader::readHelmMonomer(MonomerClass monomer_class) +SequenceLoader::MonomerInfo SequenceLoader::readHelmMonomer(KetDocument& document, MonomerClass monomer_class) { std::string monomer_alias, repeating, annotation; variant_template_opts options; @@ -1371,7 +1437,7 @@ SequenceLoader::MonomerInfo SequenceLoader::readHelmMonomer(MonomerClass monomer _scanner.skip(1); was_bracket = true; } - monomer_alias = readHelmMonomerAlias(); + monomer_alias = readHelmMonomerAlias(document, monomer_class); ch = _scanner.lookNext(); bool is_variant = false; @@ -1404,7 +1470,7 @@ SequenceLoader::MonomerInfo SequenceLoader::readHelmMonomer(MonomerClass monomer } if (ch == ')') break; - opt_alias = readHelmMonomerAlias(); + opt_alias = readHelmMonomerAlias(document, monomer_class); if (aliases.count(opt_alias) > 0) throw Error("Ivalid variant monomer. 
Monomer '%s' repeated more than once.", opt_alias.c_str()); ch = readCount(count, _scanner); @@ -1471,367 +1537,6 @@ std::string SequenceLoader::readHelmSimplePolymerName(std::string& polymer_name) return polymer_type; } -void SequenceLoader::loadHELM(BaseMolecule& mol) -{ - _row = 0; - mol.clear(); - std::string simple_polymer_name = ""; - std::string simple_polymer_type = ""; - int monomer_idx = 0; - int prev_monomer_template_atom_idx = -1; - using polymer_map = std::map>; - polymer_map used_polymer_nums; - polymer_map::iterator cur_polymer_map; - enum class helm_parts - { - ListOfSimplePolymers, - ListOfConnections, - ListOfPolymerGroups, - ExtendedAnnotation, - End - }; - helm_parts helm_part = helm_parts::ListOfSimplePolymers; - - while (!_scanner.isEOF()) - { - if (helm_part == helm_parts::ListOfSimplePolymers) - { - auto ch = _scanner.lookNext(); - if (simple_polymer_name.size() == 0) // Read simple polymer_name - { - _col = 0; - simple_polymer_type = readHelmSimplePolymerName(simple_polymer_name); - if (used_polymer_nums.count(simple_polymer_name)) - throw Error("Simple polymer '%s' defined more than once.", simple_polymer_name.c_str()); - if (simple_polymer_name == simple_polymer_type) - throw Error("Polymer '%s' without number not allowed.", simple_polymer_name.c_str()); - ch = _scanner.lookNext(); - if (ch != '{') - throw Error("Unexpected symbol. Expected '{' but found '%c'.", ch); - _scanner.skip(1); // skip '{' - if (used_polymer_nums.count(simple_polymer_name)) - throw Error("Simple polymer '%s' defined more than once.", simple_polymer_name.c_str()); - auto res = used_polymer_nums.emplace(std::make_pair(simple_polymer_name, std::map())); - if (res.second) - cur_polymer_map = res.first; - else - throw Error("Internal error - cannot emplace polymer map."); - } - else if (ch == '(') - { - throw Error("Unexpected symbol '('. 
Group not supported for now."); - } - else if (ch != '}') - { - monomer_idx++; - Vec3f pos(_col * MoleculeLayout::DEFAULT_BOND_LENGTH, -MoleculeLayout::DEFAULT_BOND_LENGTH * _row, 0); - _col++; - if (simple_polymer_type == kHELMPolymerTypeUnknown) - { - Array name; - _scanner.readWord(name, reserved_helm_chars); - // skip blob for now - ch = _scanner.lookNext(); - if (ch != '}') - throw Error("Unexpected symbol. Expected '}' but found '%c'.", ch); - } - else if (simple_polymer_type == kHELMPolymerTypeCHEM) - { - auto [id, repeating, annotaion, options] = readHelmMonomer(); - ch = _scanner.lookNext(); - if (ch != '}') - throw Error("Unexpected symbol. Expected '}' but found '%c'.", ch); // only one monomer in chem - if (repeating.size()) - throw Error("Chem cannot be repeated."); - const std::string& monomer_id = _library.getMonomerTemplateIdByAlias(MonomerClass::CHEM, id); - if (monomer_id.size() == 0) // if not found - check for atom mapped SMILES([*:1]) and CXSMILES([*]...[*] |$_R1;;;;_R2;$|) - not now - throw Error("Monomer '%s' not found.", id.c_str()); - checkAddTemplate(mol, _library.getMonomerTemplateById(monomer_id)); - int chem_idx = mol.asMolecule().addAtom(-1); - mol.asMolecule().setTemplateAtom(chem_idx, id.c_str()); - mol.asMolecule().setTemplateAtomClass(chem_idx, kMonomerClassCHEM); - mol.asMolecule().setAtomXyz(chem_idx, pos); - cur_polymer_map->second[monomer_idx] = chem_idx; - } - else if (simple_polymer_type == kHELMPolymerTypePEPTIDE) - { - auto [id, repeating, annotaion, options] = readHelmMonomer(); - const std::string& monomer_id = _library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, id); - if (monomer_id.size() == 0) // if not found - check for atom mapped SMILES([*:1]) and CXSMILES([*]...[*] |$_R1;;;;_R2;$|) - not now - throw Error("Monomer '%s' not found.", id.c_str()); - if (repeating.size()) - throw Error("Repeating do not supported now."); - checkAddTemplate(mol, _library.getMonomerTemplateById(monomer_id)); - int amino_idx = 
mol.asMolecule().addAtom(-1); - mol.asMolecule().setTemplateAtom(amino_idx, id.c_str()); - mol.asMolecule().setTemplateAtomClass(amino_idx, kMonomerClassAA); - mol.asMolecule().setTemplateAtomSeqid(amino_idx, monomer_idx); - mol.asMolecule().setAtomXyz(amino_idx, pos); - cur_polymer_map->second[monomer_idx] = amino_idx; - if (monomer_idx > 1) - { - mol.asMolecule().addBond_Silent(amino_idx - 1, amino_idx, BOND_SINGLE); - mol.setTemplateAtomAttachmentOrder(amino_idx - 1, amino_idx, kRightAttachmentPoint); - mol.setTemplateAtomAttachmentOrder(amino_idx, amino_idx - 1, kLeftAttachmentPoint); - } - ch = _scanner.lookNext(); - if (ch == '.') - _scanner.skip(1); - } - else // kHELMPolymerTypeRNA - { - auto [id, repeating, annotaion, options] = readHelmMonomer(); - const std::string& phosphate_lib_id = _library.getMonomerTemplateIdByAlias(MonomerClass::Phosphate, id); - if (phosphate_lib_id.size()) - { - if (repeating.size()) - throw Error("Phosphate cannot be repeated."); - // add phosphate - checkAddTemplate(mol, _library.getMonomerTemplateById(phosphate_lib_id)); - int phosphate_idx = mol.asMolecule().addAtom(-1); - mol.asMolecule().setTemplateAtom(phosphate_idx, id.c_str()); - mol.asMolecule().setTemplateAtomClass(phosphate_idx, kMonomerClassPHOSPHATE); - mol.asMolecule().setTemplateAtomSeqid(phosphate_idx, monomer_idx); - mol.asMolecule().setAtomXyz(phosphate_idx, pos); - cur_polymer_map->second[monomer_idx] = phosphate_idx; - if (monomer_idx > 1) - { - mol.asMolecule().addBond_Silent(phosphate_idx - 1, phosphate_idx, BOND_SINGLE); - mol.setTemplateAtomAttachmentOrder(phosphate_idx - 1, phosphate_idx, kRightAttachmentPoint); - mol.setTemplateAtomAttachmentOrder(phosphate_idx, phosphate_idx - 1, kLeftAttachmentPoint); - } - ch = _scanner.lookNext(); - if (ch != '.' && ch != '}') - throw Error("Unexpected symbol. Expected '.' 
or '}' but found '%c'.", ch); - if (ch == '.') - _scanner.skip(1); - continue; - } - const std::string& sugar_id = _library.getMonomerTemplateIdByAlias(MonomerClass::Sugar, id); - if (sugar_id.size() == 0) // if not found - check for atom mapped SMILES([*:1]) and CXSMILES([*]...[*] |$_R1;;;;_R2;$|) - not now - throw Error("Sugar '%s' not found.", id.c_str()); - if (repeating.size()) - throw Error("Sugar cannot be repeated."); - checkAddTemplate(mol, _library.getMonomerTemplateById(sugar_id)); - int sugar_idx = mol.asMolecule().addAtom(-1); - mol.asMolecule().setTemplateAtom(sugar_idx, id.c_str()); - mol.asMolecule().setTemplateAtomClass(sugar_idx, kMonomerClassSUGAR); - mol.asMolecule().setTemplateAtomSeqid(sugar_idx, monomer_idx); - mol.asMolecule().setAtomXyz(sugar_idx, pos); - cur_polymer_map->second[monomer_idx] = sugar_idx; - if (monomer_idx > 1) - { - mol.asMolecule().addBond_Silent(sugar_idx - 1, sugar_idx, BOND_SINGLE); - mol.setTemplateAtomAttachmentOrder(sugar_idx - 1, sugar_idx, kRightAttachmentPoint); - mol.setTemplateAtomAttachmentOrder(sugar_idx, sugar_idx - 1, kLeftAttachmentPoint); - } - ch = _scanner.lookNext(); - if (ch == '(') // In RNA after sugar could be base in () - { - _scanner.skip(1); - monomer_idx++; - auto [base_id, base_repeating, base_annotaion, base_options] = readHelmMonomer(); - ch = _scanner.lookNext(); - if (ch != ')') - throw Error("Expected ')' after base but found '%c'.", ch); - _scanner.skip(1); - ch = _scanner.lookNext(); - if (repeating.size()) - throw Error("Base cannot be repeated."); - const std::string& base_lib_id = _library.getMonomerTemplateIdByAlias(MonomerClass::Base, base_id); - if (base_lib_id.size() == 0) // if not found - check for atom mapped SMILES([*:1]) and CXSMILES([*]...[*] |$_R1;;;;_R2;$|) - not now - throw Error("Base '%s' not found.", base_id.c_str()); - if (base_repeating.size()) - throw Error("Base cannot be repeated."); - checkAddTemplate(mol, _library.getMonomerTemplateById(base_lib_id)); - Vec3f 
base_pos((_col - 1) * MoleculeLayout::DEFAULT_BOND_LENGTH, -MoleculeLayout::DEFAULT_BOND_LENGTH * (_row + 1), 0); - int base_idx = mol.asMolecule().addAtom(-1); - mol.asMolecule().setTemplateAtom(base_idx, base_id.c_str()); - mol.asMolecule().setTemplateAtomClass(base_idx, kMonomerClassBASE); - mol.asMolecule().setTemplateAtomSeqid(base_idx, monomer_idx); - mol.asMolecule().setAtomXyz(base_idx, base_pos); - cur_polymer_map->second[monomer_idx] = base_idx; - mol.asMolecule().addBond_Silent(sugar_idx, base_idx, BOND_SINGLE); - mol.setTemplateAtomAttachmentOrder(sugar_idx, base_idx, kBranchAttachmentPoint); - mol.setTemplateAtomAttachmentOrder(base_idx, sugar_idx, kLeftAttachmentPoint); - } - if (ch == '.') - { - _scanner.skip(1); - continue; - } - if (ch == '}') - continue; - auto [phosphate_id, phosphate_repeating, phosphate_annotaion, phosphate_options] = readHelmMonomer(); - const std::string& phosp_id = _library.getMonomerTemplateIdByAlias(MonomerClass::Phosphate, phosphate_id); - if (phosp_id.size() == 0) - throw Error("Phosphate '%s' not found.", phosphate_id.c_str()); - if (repeating.size()) - throw Error("Phosphate cannot be repeated."); - monomer_idx++; - checkAddTemplate(mol, _library.getMonomerTemplateById(phosp_id)); - Vec3f phosphate_pos(_col * MoleculeLayout::DEFAULT_BOND_LENGTH, -MoleculeLayout::DEFAULT_BOND_LENGTH * _row, 0); - _col++; - int phosphate_idx = mol.asMolecule().addAtom(-1); - mol.asMolecule().setTemplateAtom(phosphate_idx, phosphate_id.c_str()); - mol.asMolecule().setTemplateAtomClass(phosphate_idx, kMonomerClassPHOSPHATE); - mol.asMolecule().setTemplateAtomSeqid(phosphate_idx, monomer_idx); - mol.asMolecule().setAtomXyz(phosphate_idx, phosphate_pos); - cur_polymer_map->second[monomer_idx] = phosphate_idx; - mol.asMolecule().addBond_Silent(sugar_idx, phosphate_idx, BOND_SINGLE); - mol.setTemplateAtomAttachmentOrder(sugar_idx, phosphate_idx, kRightAttachmentPoint); - mol.setTemplateAtomAttachmentOrder(phosphate_idx, sugar_idx, 
kLeftAttachmentPoint); - ch = _scanner.lookNext(); - if (ch != '.' && ch != '}') - throw Error("Unexpected symbol. Expected '.' or '}' but found '%c'.", ch); - if (ch == '.') - _scanner.skip(1); - } - } - else // end of polymer - } - { - _scanner.skip(1); // skip '}' - ch = _scanner.lookNext(); - if (ch == '"') - { - Array annotation; - _scanner.skip(1); - _scanner.readWord(annotation, "\""); - if (_scanner.lookNext() != '"') - throw Error("Unexpected symbol. Expected '\"' but found '%c'.", _scanner.lookNext()); - _scanner.skip(1); - // skip annotation for now - ch = _scanner.lookNext(); - } - _row++; - _col = 0; - monomer_idx = 0; - if (simple_polymer_type == kHELMPolymerTypeRNA) - _row++; // additional row for bases in RNA - if (ch == '|') - { - // cleanup to go to next simple polymer - simple_polymer_name = ""; - simple_polymer_type = ""; - } - else if (ch == '$') - { - helm_part = helm_parts::ListOfConnections; - } - else if (ch == -1) - { - throw Error(unexpected_eod); - } - else - { - throw Error("Unexpected symbol. Expected '|' or '$' but found '%c'.", ch); - } - _scanner.skip(1); - } - } - else if (helm_part == helm_parts::ListOfConnections) - { - auto ch = _scanner.lookNext(); - if (ch == '$') - { - helm_part = helm_parts::ListOfPolymerGroups; - _scanner.skip(1); - continue; - } - // CHEM1,RNA1,32:R1-12:R2"annotation"|..... - std::string left_polymer, right_polymer; - std::ignore = readHelmSimplePolymerName(left_polymer); - auto left_polymer_nums = used_polymer_nums.find(left_polymer); - if (left_polymer_nums == used_polymer_nums.end()) - throw Error("Polymer '%s' not found.", left_polymer.c_str()); - ch = _scanner.lookNext(); - if (ch != ',') - throw Error("Unexpected symbol. 
Expected ',' but found '%c'.", _scanner.lookNext()); - _scanner.skip(1); - std::ignore = readHelmSimplePolymerName(right_polymer); - auto right_polymer_nums = used_polymer_nums.find(right_polymer); - if (right_polymer_nums == used_polymer_nums.end()) - throw Error("Polymer '%s' not found.", right_polymer.c_str()); - ch = _scanner.lookNext(); - if (ch != ',') - throw Error("Unexpected symbol. Expected ',' but found '%c'.", _scanner.lookNext()); - _scanner.skip(1); - // read monomer position - int left_monomer_idx, right_monomer_idx; - Array left_ap, right_ap; - Array position; - size_t error_pos; - _scanner.readWord(position, ":"); - _scanner.skip(1); - left_monomer_idx = std::stoi(position.ptr(), &error_pos); - if (error_pos != position.size() - 1) // arrray contains 0 at the end - throw Error("Only direct connections supported now."); - _scanner.readWord(left_ap, "-"); - _scanner.skip(1); - position.clear(); - _scanner.readWord(position, ":"); - _scanner.skip(1); - right_monomer_idx = std::stoi(position.ptr(), &error_pos); - if (error_pos != position.size() - 1) // arrray contains 0 at the end - throw Error("Only direct connections supported now."); - _scanner.readWord(right_ap, "\"|$"); - auto left_mon_it = left_polymer_nums->second.find(left_monomer_idx); - if (left_mon_it == left_polymer_nums->second.end()) - throw Error("Polymer '%s' does not contains monomer with number %d.", left_polymer.c_str(), left_monomer_idx); - int left_templ_atom_idx = left_mon_it->second; - auto right_mon_it = right_polymer_nums->second.find(right_monomer_idx); - if (right_mon_it == right_polymer_nums->second.end()) - throw Error("Polymer '%s' does not contains monomer with number %d.", right_polymer.c_str(), right_monomer_idx); - int right_templ_atom_idx = right_mon_it->second; - mol.asMolecule().addBond_Silent(left_templ_atom_idx, right_templ_atom_idx, BOND_SINGLE); - mol.setTemplateAtomAttachmentOrder(left_templ_atom_idx, right_templ_atom_idx, 
convertAPFromHELM(left_ap.ptr()).c_str()); - mol.setTemplateAtomAttachmentOrder(right_templ_atom_idx, left_templ_atom_idx, convertAPFromHELM(right_ap.ptr()).c_str()); - if (_scanner.isEOF()) - throw Error(unexpected_eod); - ch = _scanner.readChar(); - if (ch == '"') - { - Array annotation; - _scanner.readWord(annotation, "\""); - if (_scanner.isEOF()) - throw Error(unexpected_eod); - if (_scanner.lookNext() != '"') - throw Error("Unexpected char. Expected '\"' but found '%c'.", _scanner.lookNext()); - _scanner.skip(1); // skip '"' - if (_scanner.isEOF()) - throw Error(unexpected_eod); - ch = _scanner.readChar(); - } - if (ch != '|' && ch != '$') - throw Error("Unexpected symbol. Expected '|' or '$' but found '%c'.", _scanner.lookNext()); - } - else if (helm_part == helm_parts::ListOfPolymerGroups) - { - Array groups; - _scanner.readWord(groups, "$"); - // skip groups for now - helm_part = helm_parts::ExtendedAnnotation; - } - else // helm_parts::ExtendedAnnotation - { - // read rest of data - std::string rest_of_helm; - _scanner.readAll(rest_of_helm); - auto it = rest_of_helm.find_last_of('$'); - if (it == rest_of_helm.npos) - throw Error("Incorrect format. 
Last '$' not found."); - std::string signature = rest_of_helm.substr(it + 1); - // split by last '$' and check if right part eq “V2.0” - // if (signature != "v2.0") - // throw Error("Expected HELM V2.0 but got '%s'.", signature.c_str()); - // check that left part is valid json - TODO - helm_part = helm_parts::End; - } - } - if (helm_part != helm_parts::End) - throw Error(unexpected_eod); -} - const std::string SequenceLoader::checkAddVariantMonomerTemplate(KetDocument& document, const std::string& alias, MonomerClass monomer_class, variant_template_opts& options) { @@ -1961,7 +1666,7 @@ void SequenceLoader::loadHELM(KetDocument& document) continue; } const auto& monomer_class = MonomerTemplates::getStrToMonomerType().at(simple_polymer_type); - auto monomer_info = readHelmMonomer(monomer_class); + auto monomer_info = readHelmMonomer(document, monomer_class); if (monomer_class == MonomerClass::CHEM) { ch = _scanner.lookNext(); @@ -2004,7 +1709,7 @@ void SequenceLoader::loadHELM(KetDocument& document) if (ch == '(') // In RNA after sugar could be base in () { monomer_idx++; - auto base_info = readHelmMonomer(MonomerClass::Base); + auto base_info = readHelmMonomer(document, MonomerClass::Base); ch = _scanner.lookNext(); Vec3f base_pos(pos.x, pos.y - MoleculeLayout::DEFAULT_BOND_LENGTH, 0); auto base_idx = addKetMonomer(document, base_info, MonomerClass::Base, base_pos); @@ -2019,7 +1724,7 @@ void SequenceLoader::loadHELM(KetDocument& document) } if (ch == '}') continue; - auto phosphate_info = readHelmMonomer(MonomerClass::Phosphate); + auto phosphate_info = readHelmMonomer(document, MonomerClass::Phosphate); monomer_idx++; Vec3f phosphate_pos(_col * MoleculeLayout::DEFAULT_BOND_LENGTH, -MoleculeLayout::DEFAULT_BOND_LENGTH * _row, 0); _col++; diff --git a/core/indigo-core/molecule/src/sequence_saver.cpp b/core/indigo-core/molecule/src/sequence_saver.cpp index ae4b215a85..073ca04a96 100644 --- a/core/indigo-core/molecule/src/sequence_saver.cpp +++ 
b/core/indigo-core/molecule/src/sequence_saver.cpp @@ -20,11 +20,14 @@ #include "base_cpp/output.h" #include "base_cpp/scanner.h" #include "layout/sequence_layout.h" +#include "molecule/elements.h" #include "molecule/ket_document.h" #include "molecule/ket_objects.h" #include "molecule/molecule.h" +#include "molecule/molecule_json_loader.h" #include "molecule/monomer_commons.h" #include "molecule/monomers_template_library.h" +#include "molecule/smiles_saver.h" using namespace indigo; @@ -1086,6 +1089,7 @@ std::string SequenceSaver::saveHELM(KetDocument& document, std::vector> mol_atom_to_ap; for (auto& sequence : sequences) { int monomer_idx = 0; @@ -1159,6 +1163,83 @@ std::string SequenceSaver::saveHELM(KetDocument& document, std::vectorsgroups; + for (int i = sgroups.begin(); i != sgroups.end(); i = sgroups.next(i)) + { + auto& sgroup = sgroups.getSGroup(i); + if (sgroup.sgroup_type != SGroup::SG_TYPE_SUP) + continue; + Superatom& sa = static_cast(sgroup); + if (sa.subscript.size() != 0 && sa.subscript.ptr()[0] != 0) + continue; + // convert leaving atom H to rg-ref + auto& atom_to_ap = mol_atom_to_ap.try_emplace(mol_id).first; + static std::string apid_prefix{'R'}; + Array leaving_atoms; + for (int ap_id = sa.attachment_points.begin(); ap_id != sa.attachment_points.end(); ap_id = sa.attachment_points.next(ap_id)) + { + auto& ap = sa.attachment_points.at(ap_id); + std::string apid = apid_prefix + ap.apid.ptr(); + atom_to_ap->second.emplace(ap.aidx, apid); + int leaving_atom = ap.lvidx; + int ap_idx = std::stoi(ap.apid.ptr()); + if (pbmol == &mol) + { + mol.resetAtom(leaving_atom, ELEM_RSITE); + mol.allowRGroupOnRSite(leaving_atom, ap_idx); + } + else + { + auto rsite = std::make_unique(QueryMolecule::ATOM_RSITE, 0); + qmol.resetAtom(leaving_atom, rsite.release()); + qmol.allowRGroupOnRSite(leaving_atom, ap_idx); + } + } + sgroups.remove(i); + } + // generate smiles + std::string smiles; + StringOutput s_out(smiles); + SmilesSaver saver(s_out); + if (pbmol == &mol) 
+ saver.saveMolecule(mol); + else + saver.saveQueryMolecule(qmol); + // save as chem + if (helm_string.size() > 0) + helm_string += '|'; + helm_string += "CHEM"; + polymer_idx = ++chem_idx; + helm_string += std::to_string(polymer_idx); + helm_string += "{["; + helm_string += smiles; + helm_string += '}'; + monomer_id_to_monomer_info.emplace(std::make_pair(mol_id, std::make_tuple(HELMType::Chem, polymer_idx, 1))); + } } helm_string += '$'; // Add connections @@ -1170,12 +1251,16 @@ std::string SequenceSaver::saveHELM(KetDocument& document, std::vector