From 14510ccbabb83b12bcc2f5c481e77e4d05e88452 Mon Sep 17 00:00:00 2001
From: Evan Morris
Date: Wed, 11 Sep 2024 16:52:54 -0400
Subject: [PATCH] Squashed commit of the following:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 83a88636b885ac0611d3654abbe378a4916dcbf3
Merge: b124dfb f4e4003
Author: beasleyjonm <85600465+beasleyjonm@users.noreply.github.com>
Date: Wed Sep 11 12:27:06 2024 -0400

    Merge pull request #256 from RobokopU24/PHAROS_target_for_edges

    Add "target_for" edges from loadPHAROS.py

commit f4e4003f66a800472e6ca4578a0b94061c312ad1
Author: beasleyjonm <85600465+beasleyjonm@users.noreply.github.com>
Date: Fri Aug 23 13:47:54 2024 -0400

    Add "target_for" edges from loadPHAROS.py

commit b124dfb1921b1ffb60cfdf3ac964b647c4db1110
Merge: 74ea571 0a22609
Author: beasleyjonm <85600465+beasleyjonm@users.noreply.github.com>
Date: Tue Aug 13 11:24:52 2024 -0400

    Merge pull request #252 from RobokopU24/neo4j_5

    Neo4j 5

commit 74ea571580e746863a24fb80c7d190123b8b51e3
Merge: 6836058 eeacd0b
Author: Evan Morris
Date: Tue Aug 13 10:58:48 2024 -0400

    Merge pull request #245 from RobokopU24/DnlRKorn-patch-7

    Set filenames to be based on object settings in all functions loadHistoneMap.py

commit 6836058361ce968f553b47a92e526595409783ec
Merge: 0f389ab 2f0d2e7
Author: Evan Morris
Date: Tue Aug 13 10:57:17 2024 -0400

    Merge pull request #239 from RobokopU24/DnlRKorn-patch-1

    Update loadFDB.py

commit 0f389abe69a1698763bd53beaa91df386ec7484d
Merge: 9f75017 f83234c
Author: Evan Morris
Date: Tue Aug 13 10:54:42 2024 -0400

    Merge pull request #241 from RobokopU24/DnlRKorn-patch-3

    Added TODO to loadGTEx.py

commit 9f750170d89d96e2eb78ed0440873c10027d9331
Merge: 0cf45a0 a45698f
Author: Evan Morris
Date: Tue Aug 13 10:50:59 2024 -0400

    Merge pull request #243 from RobokopU24/DnlRKorn-patch-5

    Update loadIA.py

commit 0cf45a030afdf95ec06bbd85a3779b26da9a7773
Merge: 8db9527 827751d
Author: Evan Morris
Date: Tue Aug 13 10:50:14 2024 -0400

    Merge pull request #244 from RobokopU24/DnlRKorn-patch-6

    Added TODO to loadPanther.py

commit 8db95272bf3ef82615e20d26a976e90c9464489c
Merge: e94d556 40cad6f
Author: Evan Morris
Date: Tue Aug 13 10:49:25 2024 -0400

    Merge pull request #246 from RobokopU24/DnlRKorn-patch-8

    Added TODO to loadUniRef.py

commit 0a226095eff9329700c25854c0b64c9f29c3e9dc
Merge: 77c92b3 e94d556
Author: Evan Morris
Date: Tue Aug 13 10:47:12 2024 -0400

    Merge branch 'master' into neo4j_5

commit 77c92b3147448793a9b3707c9d45b62607a9e7df
Author: Evan Morris
Date: Tue Aug 13 10:45:36 2024 -0400

    changed bd_file_name in accordance with changes in BINDING (no more version in file name), also reverting some changes, see description

    I had made a change that would store the downloaded files with consistent names so that get_latest_version would be called fewer times, but it's not really worth it because it's slow to recompress the archive and it doesn't prevent all calls to latest version..
there are better ways to do this commit 5acb0520cc3049ca9ecc9418e80ead5179fd953a Author: Evan Morris Date: Mon Aug 12 19:05:43 2024 -0400 removing import, not in use (yet) commit 17ff6aead8e7ffd54c441463bc8c30f0042bd403 Author: Evan Morris Date: Mon Aug 12 18:48:39 2024 -0400 making quote usage consistent commit 487863bf94d8312be6d06d9bd775006560be1b5d Author: Evan Morris Date: Mon Aug 12 18:14:03 2024 -0400 upgrading Reactome to neo4j 5, adding 4 to 5 migration step commit 6af16359bc082e55f32ca1105f108150ce04bb0e Author: Evan Morris Date: Mon Aug 12 17:18:22 2024 -0400 commenting these out so they are truly optional by default commit 7540dde819a05b3708e7806e8a7c0cfe14e316ec Author: Evan Morris Date: Mon Aug 12 17:17:35 2024 -0400 adding edge id commit daa90bc4f68cf4a4495d05a7f3a363d36d7b3cac Author: Evan Morris Date: Thu Aug 8 16:29:08 2024 -0400 fixing literal comparison commit 7268385336e0e2382d473631bfdb59e936880812 Author: Evan Morris Date: Thu Aug 8 14:45:17 2024 -0400 oops, fixing source id commit 222db0cab121aa0a7c54015f92ce870e3337564e Author: Evan Morris Date: Thu Aug 8 14:22:49 2024 -0400 adding a spec for ctkp and changing source id to ClinicalTrialsKP in the mappings commit e94d556ede61fb6fa9a9e23b35201501a05e1fdd Merge: 8d8b643 f369559 Author: Evan Morris Date: Thu Aug 8 14:13:16 2024 -0400 Merge pull request #250 from RobokopU24/docs-fix fixing remnants of data_services commit f369559cdf103aff18311dc11dffa3e367615d4e Author: Evan Morris Date: Thu Aug 8 14:06:17 2024 -0400 fixing remnants of data_services commit db4c548ee214b44b9f7788ec1e2d9eddc73e2fef Author: Evan Morris Date: Thu Aug 8 13:51:37 2024 -0400 adding CTKP commit 5bf7bda6b5fb56ce06d6bedea55e723598031fd3 Author: Evan Morris Date: Tue Aug 6 16:10:06 2024 -0400 merging of properties more robust, avoid failure when there are properties to merge where only one is a list commit 265e4e8969a9e9aad756d6d06a13fcb545a7094b Author: Evan Morris Date: Fri Aug 2 02:08:54 2024 -0400 bumping bl versions commit 8422572d3d045c328acdc59403caa7394fd99268 Author: Evan Morris Date: Fri Aug 2 01:36:37 2024 -0400 allowing nodes with no names, with the condition they get their id set as the name commit 669de091ecd720b0c76c0375c9a94621d80a66a1 Author: Evan Morris Date: Wed Jul 31 12:29:23 2024 -0400 fixing load argument from-path for neo4j 5 commit 40cad6f23987d1fa4e7e8cb9ef7c5f7008591bed Author: DnlRKorn <6885702+DnlRKorn@users.noreply.github.com> Date: Tue Jul 30 16:52:44 2024 -0400 Added TODO to loadUniRef.py commit eeacd0ba75c77d7bf68c998f6fcb2081151fbcdc Author: DnlRKorn <6885702+DnlRKorn@users.noreply.github.com> Date: Fri Jul 26 16:41:36 2024 -0400 Set filenames to be based on object settings in all functions loadHistoneMap.py commit 827751d6e192d6cfa042b2ae27d6e9bd9a3e4405 Author: DnlRKorn <6885702+DnlRKorn@users.noreply.github.com> Date: Fri Jul 26 10:19:57 2024 -0400 Added TODO to loadPanther.py commit a45698f18eb7472574b59058639705eeb9707e81 Author: DnlRKorn <6885702+DnlRKorn@users.noreply.github.com> Date: Fri Jul 26 10:11:53 2024 -0400 Update loadIA.py Changed hardcoded URLs to class variables. 
commit 350f7b6d4bb5fa99170a6e31d89a297d08ea6f2d Author: Evan Morris Date: Thu Jul 25 14:50:44 2024 -0400 reverting to sequential node norm calls commit f83234ced566138224a3b04db1289a630d253950 Author: DnlRKorn <6885702+DnlRKorn@users.noreply.github.com> Date: Thu Jul 25 12:08:30 2024 -0400 Added TODO to loadGTEx.py commit 2f0d2e7a813712a2df7a1fa4c4470c6f395cbaa7 Author: DnlRKorn <6885702+DnlRKorn@users.noreply.github.com> Date: Thu Jul 25 11:21:21 2024 -0400 Update loadFDB.py Added small safety check in get_data. commit 1613f623c66c16b1fb57d171f930dca74c2ac16b Author: Evan Morris Date: Wed Jul 24 15:04:45 2024 -0400 fixing pool size and making backoff factor slower for node norm commit 3302b5a5ab784fbf2c2c04dd92297e779f8214c2 Author: Evan Morris Date: Tue Jul 23 15:28:25 2024 -0400 Squashed commit of the following: commit 8d8b643284e70e23c6bb5e2bb48425c9bc949ee4 Merge: 1f01b43 b0cf278 Author: Evan Morris Date: Tue Jul 23 15:01:08 2024 -0400 Merge pull request #230 from RobokopU24/drugcentral_treats_refactor Refactored treats edges for DrugCentral to differentiate between indi… commit b0cf278e02ffffd2b64834a8f57894dcd7adc72e Author: eKathleenCarter <163005214+eKathleenCarter@users.noreply.github.com> Date: Tue Jul 23 12:04:39 2024 -0400 Update loaddrugcentral.py kathleen modifications changed the mapping of symptomatic treatment to RO:0002606 (instead of RO:0003307) changed the mapping of diagnosis to DrugCentral:5271 commit 1f01b43a842706e9cb5723c88c3bbd8371ff505a Merge: ec6b7d8 734cc6e Author: Evan Morris Date: Fri Jul 19 14:38:15 2024 -0400 Merge pull request #232 from RobokopU24/binding-predicate-update Modified predicates affected by biolink:binds commit 734cc6e4c42e2315b217e762bb481aff867835e7 Author: Evan Morris Date: Fri Jul 19 14:32:02 2024 -0400 bumping parsing versions for binds refactor parsers commit ec6b7d8a7bfdfd17ec6b95354b21f17ce9079c75 Merge: e44cae6 00c6627 Author: Evan Morris Date: Fri Jul 19 14:28:50 2024 -0400 Merge pull request #233 from RobokopU24/pharos_KL/AT Improving the KL/AT in Pharos commit 00c6627485c2866d7bbfc301db55ce428bcdcf1f Author: Evan Morris Date: Fri Jul 19 14:26:48 2024 -0400 bumping parsing version commit 766c5b75dff428030bb2820547c1a3fb1521d248 Author: eKathleenCarter <163005214+eKathleenCarter@users.noreply.github.com> Date: Tue Jul 16 12:42:45 2024 -0400 Update predicates.py forgot to revert changes for interacts_with. Will keep it unmodified until further clarification. commit 4cb0188743f17168cf5e5166196bf61e52ffd82d Author: eKathleenCarter <163005214+eKathleenCarter@users.noreply.github.com> Date: Tue Jul 16 12:41:36 2024 -0400 Update predicates.py after comments Made changes to reflect comments by Chris: 1. Do not modify "ed50": f"RO:0002434" until we know where this predicate comes from. "But I guess the main point is that it makes no sense to have an ed50 between a chemical and a gene/protein. Is it possible to see whether we actually have any of these?" 2. "gi50": f"RO:0002434" -> {DGIDB}:Inhibitor 3. Do not modify "interacts_with" until we know where this predicate comes from. I" guess it would be easier to be sure if we know the context in which "interacts-with" was found in our ingestss. (but I'd be tempted to put 2436 for this one)" Main take away: where do these predicates come from? Can we track down how this was generated? 
commit 711ac5af9d19ae6bdeb7c24c1b54dd836db3cfa5 Author: Kathleen Date: Wed Jul 10 15:10:28 2024 -0400 Improving the KL/AT in Pharos commit a687f3066bd7a1fb8a17897d2b7805f546f9aa57 Author: Kathleen Date: Wed Jul 10 12:32:26 2024 -0400 Modified predicates affected by biolink:binds commit e44cae6e8ff2bcff2ce08a57a8749997241107a1 Merge: 7ba9538 fa8ebf0 Author: Evan Morris Date: Wed Jun 26 16:38:02 2024 -0400 Merge pull request #231 from RobokopU24/requirements.txt-patch Update requirements.txt commit fa8ebf05e3761cb0d563818928857d0c8af3a2d2 Author: beasleyjonm <85600465+beasleyjonm@users.noreply.github.com> Date: Wed Jun 26 16:37:03 2024 -0400 Update requirements.txt Updated requests 2.32.0 -> 2.32.3 commit ef231fcf2be790bab0ff3360ee39796c27859d7f Author: beasleyjonm Date: Fri May 24 14:01:06 2024 -0400 Refactored treats edges for DrugCentral to differentiate between indications, off-label use, prevention, and treatment of symptoms. commit ccd0e57fce306752a945e9974e59858f8434da5c Author: Evan Morris Date: Tue Jul 23 15:27:07 2024 -0400 increasing connection pool size and retry attempts for node norm commit 8d8b643284e70e23c6bb5e2bb48425c9bc949ee4 Merge: 1f01b43 b0cf278 Author: Evan Morris Date: Tue Jul 23 15:01:08 2024 -0400 Merge pull request #230 from RobokopU24/drugcentral_treats_refactor Refactored treats edges for DrugCentral to differentiate between indi… commit b0cf278e02ffffd2b64834a8f57894dcd7adc72e Author: eKathleenCarter <163005214+eKathleenCarter@users.noreply.github.com> Date: Tue Jul 23 12:04:39 2024 -0400 Update loaddrugcentral.py kathleen modifications changed the mapping of symptomatic treatment to RO:0002606 (instead of RO:0003307) changed the mapping of diagnosis to DrugCentral:5271 commit 9f1966036b2de2a2e7f6242a4fb4e545393a9b25 Author: Evan Morris Date: Tue Jul 23 11:36:10 2024 -0400 improving error catching and logging for node normalization commit 7731c4a72d5c6e55cb037c66432f02f074b939f5 Author: Evan Morris Date: Tue Jul 23 01:48:49 2024 -0400 implemented concurrent calls to node normalizer for performance, using requests sessions for retries and performance, cleaned up comments commit 4bdeb7dc99082aa5fcfe7ac30c478f68027283a7 Author: Evan Morris Date: Tue Jul 23 01:37:56 2024 -0400 implemented better timeouts, retries, and failure for determining latest version, changed so that downloaded data files won't have version in their name commit 6bc0270ef8a66cc7be6b5eb231c5e95ac2d691ba Author: Evan Morris Date: Fri Jul 19 16:44:32 2024 -0400 Squashed commit of the following: commit 1f01b43a842706e9cb5723c88c3bbd8371ff505a Merge: ec6b7d8 734cc6e Author: Evan Morris Date: Fri Jul 19 14:38:15 2024 -0400 Merge pull request #232 from RobokopU24/binding-predicate-update Modified predicates affected by biolink:binds commit 734cc6e4c42e2315b217e762bb481aff867835e7 Author: Evan Morris Date: Fri Jul 19 14:32:02 2024 -0400 bumping parsing versions for binds refactor parsers commit ec6b7d8a7bfdfd17ec6b95354b21f17ce9079c75 Merge: e44cae6 00c6627 Author: Evan Morris Date: Fri Jul 19 14:28:50 2024 -0400 Merge pull request #233 from RobokopU24/pharos_KL/AT Improving the KL/AT in Pharos commit 00c6627485c2866d7bbfc301db55ce428bcdcf1f Author: Evan Morris Date: Fri Jul 19 14:26:48 2024 -0400 bumping parsing version commit 766c5b75dff428030bb2820547c1a3fb1521d248 Author: eKathleenCarter <163005214+eKathleenCarter@users.noreply.github.com> Date: Tue Jul 16 12:42:45 2024 -0400 Update predicates.py forgot to revert changes for interacts_with. Will keep it unmodified until further clarification. 
commit 4cb0188743f17168cf5e5166196bf61e52ffd82d Author: eKathleenCarter <163005214+eKathleenCarter@users.noreply.github.com> Date: Tue Jul 16 12:41:36 2024 -0400 Update predicates.py after comments Made changes to reflect comments by Chris: 1. Do not modify "ed50": f"RO:0002434" until we know where this predicate comes from. "But I guess the main point is that it makes no sense to have an ed50 between a chemical and a gene/protein. Is it possible to see whether we actually have any of these?" 2. "gi50": f"RO:0002434" -> {DGIDB}:Inhibitor 3. Do not modify "interacts_with" until we know where this predicate comes from. I" guess it would be easier to be sure if we know the context in which "interacts-with" was found in our ingestss. (but I'd be tempted to put 2436 for this one)" Main take away: where do these predicates come from? Can we track down how this was generated? commit 711ac5af9d19ae6bdeb7c24c1b54dd836db3cfa5 Author: Kathleen Date: Wed Jul 10 15:10:28 2024 -0400 Improving the KL/AT in Phaors commit a687f3066bd7a1fb8a17897d2b7805f546f9aa57 Author: Kathleen Date: Wed Jul 10 12:32:26 2024 -0400 Modified predicates affected by biolink:binds commit e44cae6e8ff2bcff2ce08a57a8749997241107a1 Merge: 7ba9538 fa8ebf0 Author: Evan Morris Date: Wed Jun 26 16:38:02 2024 -0400 Merge pull request #231 from RobokopU24/requirements.txt-patch Update requirements.txt commit fa8ebf05e3761cb0d563818928857d0c8af3a2d2 Author: beasleyjonm <85600465+beasleyjonm@users.noreply.github.com> Date: Wed Jun 26 16:37:03 2024 -0400 Update requirements.txt Updated requests 2.32.0 -> 2.32.3 commit 1f01b43a842706e9cb5723c88c3bbd8371ff505a Merge: ec6b7d8 734cc6e Author: Evan Morris Date: Fri Jul 19 14:38:15 2024 -0400 Merge pull request #232 from RobokopU24/binding-predicate-update Modified predicates affected by biolink:binds commit 734cc6e4c42e2315b217e762bb481aff867835e7 Author: Evan Morris Date: Fri Jul 19 14:32:02 2024 -0400 bumping parsing versions for binds refactor parsers commit ec6b7d8a7bfdfd17ec6b95354b21f17ce9079c75 Merge: e44cae6 00c6627 Author: Evan Morris Date: Fri Jul 19 14:28:50 2024 -0400 Merge pull request #233 from RobokopU24/pharos_KL/AT Improving the KL/AT in Pharos commit 00c6627485c2866d7bbfc301db55ce428bcdcf1f Author: Evan Morris Date: Fri Jul 19 14:26:48 2024 -0400 bumping parsing version commit 766c5b75dff428030bb2820547c1a3fb1521d248 Author: eKathleenCarter <163005214+eKathleenCarter@users.noreply.github.com> Date: Tue Jul 16 12:42:45 2024 -0400 Update predicates.py forgot to revert changes for interacts_with. Will keep it unmodified until further clarification. commit 4cb0188743f17168cf5e5166196bf61e52ffd82d Author: eKathleenCarter <163005214+eKathleenCarter@users.noreply.github.com> Date: Tue Jul 16 12:41:36 2024 -0400 Update predicates.py after comments Made changes to reflect comments by Chris: 1. Do not modify "ed50": f"RO:0002434" until we know where this predicate comes from. "But I guess the main point is that it makes no sense to have an ed50 between a chemical and a gene/protein. Is it possible to see whether we actually have any of these?" 2. "gi50": f"RO:0002434" -> {DGIDB}:Inhibitor 3. Do not modify "interacts_with" until we know where this predicate comes from. I" guess it would be easier to be sure if we know the context in which "interacts-with" was found in our ingestss. (but I'd be tempted to put 2436 for this one)" Main take away: where do these predicates come from? Can we track down how this was generated? 
commit 711ac5af9d19ae6bdeb7c24c1b54dd836db3cfa5 Author: Kathleen Date: Wed Jul 10 15:10:28 2024 -0400 Improving the KL/AT in Pharos commit a687f3066bd7a1fb8a17897d2b7805f546f9aa57 Author: Kathleen Date: Wed Jul 10 12:32:26 2024 -0400 Modified predicates affected by biolink:binds commit e44cae6e8ff2bcff2ce08a57a8749997241107a1 Merge: 7ba9538 fa8ebf0 Author: Evan Morris Date: Wed Jun 26 16:38:02 2024 -0400 Merge pull request #231 from RobokopU24/requirements.txt-patch Update requirements.txt commit fa8ebf05e3761cb0d563818928857d0c8af3a2d2 Author: beasleyjonm <85600465+beasleyjonm@users.noreply.github.com> Date: Wed Jun 26 16:37:03 2024 -0400 Update requirements.txt Updated requests 2.32.0 -> 2.32.3 commit 8ea9ecb8b36226af539f38c5b06035b862b1e73d Author: Evan Morris Date: Wed Jun 5 15:01:41 2024 -0400 switching to pypi version of robokop-genetics commit 86cf66a8d504017aa20a9120e169d420688b1171 Author: Evan Morris Date: Tue Jun 4 12:46:18 2024 -0400 updating bl version commit 0d72f7cbe8ad2dfd2c80ed247818d13dbc4fd7c7 Author: Evan Morris Date: Tue May 28 12:02:05 2024 -0400 updating release actions to new versions commit 56782293530a63f5b4e665b962e8087c141fe180 Author: Evan Morris Date: Tue May 28 12:00:17 2024 -0400 updating dockerfile and docker compose for neo4j 5, making docker platform an env var instead of hard coded commit 905d6f71d61ab6d362e01ab40fb4d53d2c5f0c9d Author: Evan Morris Date: Tue May 28 11:09:30 2024 -0400 updating neo4j tools for neo4j 5 commit 2bad6d329c6a27973b148e3b88f12bf689b11705 Author: Evan Morris Date: Tue May 28 11:08:47 2024 -0400 making it so that specific subgraph sources matter for determining whether on-disk merge is needed commit 1a2a5fb3cd45650025054f6e3b7060a15f37adb5 Author: Evan Morris Date: Tue May 28 11:06:03 2024 -0400 moved NormalizationScheme into normalization file, altered subgraph metadata to be a metadata object not just dict commit ef231fcf2be790bab0ff3360ee39796c27859d7f Author: beasleyjonm Date: Fri May 24 14:01:06 2024 -0400 Refactored treats edges for DrugCentral to differentiate between indications, off-label use, prevention, and treatment of symptoms. commit cc8171777b143e07014e15ed0cd5ede5106eccd6 Author: Evan Morris Date: Tue May 21 15:29:50 2024 -0400 bumping versions of all dependencies commit 7ba9538988840ab456d30afec417601ce1eb18a6 Merge: 69a9d5c 4a0236e Author: Evan Morris Date: Tue May 21 14:46:17 2024 -0400 Merge pull request #229 from RobokopU24/dependabot/pip/requests-2.32.0 Bump requests from 2.31.0 to 2.32.0 commit 4a0236ee6d89c8dcf802a0e2b78e77f826d7ee44 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue May 21 08:27:19 2024 +0000 --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] commit 69a9d5c22ef84815296e344777cefdab44ec681c Merge: 67a5dbf 3cfa692 Author: Evan Morris Date: Fri May 17 10:11:55 2024 -0400 Merge pull request #228 from RobokopU24/deployment-updates Deployment updates commit 3cfa6921b2d61e343fd83a5cfb9e783fdde577d1 Author: Evan Morris Date: Tue May 14 13:11:21 2024 -0400 adding error checking for output_format==None commit 3af2ee240e04eb3e6e2df449a7a267a02591a431 Author: Evan Morris Date: Tue May 14 12:48:01 2024 -0400 improving output_format parsing commit 2bf5721b3512d101be67c5c72f3080756cd2520d Author: Evan Morris Date: Tue May 14 12:46:29 2024 -0400 removing jsonl which does nothing commit bc3779b8788a0347aea93d1a9443d5b135edb6ce Author: Evan Morris Date: Tue May 14 12:45:22 2024 -0400 fixing ubergraph source id commit 55b48c3fd76743eaed5142e183a08bd4febb0943 Author: Evan Morris Date: Mon May 13 17:34:48 2024 -0400 adding redundant jsonl output commit d0cf880fb631044017c20cbb285581eeefca2c1d Author: Evan Morris Date: Mon May 13 17:31:49 2024 -0400 adding option for redundant edge kg file output to the graph spec commit aa8125bc00cd42bb6f3862077baffc65ee8c676e Author: Evan Morris Date: Mon May 13 17:23:55 2024 -0400 moving redundant kg generation into Common commit 132221ab94d5b5b270119daf24a95e069ddeeec2 Author: Evan Morris Date: Mon May 13 17:13:45 2024 -0400 bail before generating dumps on QC failures, but don't fail based on infores identifiers commit 045829105f57cda4adae2297e525f3fe865f662d Author: Evan Morris Date: Fri May 10 16:54:59 2024 -0400 adding rule mining kg commit a26349fc50669e1428597fec2125e2a22ef0759c Author: Evan Morris Date: Fri May 10 16:51:13 2024 -0400 pinning pharos version in the parser instead of graph specs commit fac5c009446f86fb150ddd480c36d681b17e9e7e Author: Evan Morris Date: Fri May 10 16:37:49 2024 -0400 adding dug graph spec commit d1a3612ebfe7deb965846d9f78bb37dfe62fe20e Author: Evan Morris Date: Fri May 10 16:37:41 2024 -0400 adding a slim resources values option commit 26aabc369e084c8820d67a3ff88c0774896ca366 Author: Evan Morris Date: Fri May 10 16:36:59 2024 -0400 updating image and biolink versions commit 67a5dbfdb2ac18fbe31466adc6c64835b10180b9 Merge: 63f33bd adf4b0f Author: Evan Morris Date: Fri May 10 15:59:41 2024 -0400 Merge pull request #225 from RobokopU24/subclass_loops Subclass loops and qualifier merging commit 63f33bd1cc43d73f62833b45a5829583177d8675 Merge: 16b24ca 78eb0e0 Author: Evan Morris Date: Fri May 10 15:59:19 2024 -0400 Merge pull request #224 from RobokopU24/yobokop_go2gene_fix Fixed broken link for SGD Gene2GOTerm download file. commit 78eb0e0acc6f3b07df46b80543150a6eaeb7135f Author: Evan Morris Date: Fri May 10 15:49:01 2024 -0400 adding drugmechdb to baseline in yobokop commit 0de4d26d24a8db2a18156de3834084a02b54e3c6 Author: beasleyjonm Date: Thu May 9 13:29:23 2024 -0400 Fixed broken link for SGD Gene2GOTerm download file. 
commit adf4b0f0c4f35e2b91531550ffe75d61b598e4e3 Author: Evan Morris Date: Fri May 3 15:53:24 2024 -0400 added knowledge level and agent type to pharos commit 5f3e996d3621a834c38806183cf119441e36f2eb Author: Evan Morris Date: Fri May 3 14:32:12 2024 -0400 added test for edge merging with qualifiers commit 20220ecce517a73ba423ce6b70a196b5605c218e Author: Evan Morris Date: Fri May 3 13:33:50 2024 -0400 implementation for having edge merging include qualifiers in matching key commit c54edc861c32f73c53838841387d02d46c5ee823 Author: Evan Morris Date: Fri May 3 13:28:18 2024 -0400 implementing removal of subclass_of loop edges commit 45d6ced3ade824e3083b0f2936a23b754bb62b80 Author: Evan Morris Date: Fri May 3 12:55:01 2024 -0400 making public instance of drugcentral the default in the helm chart commit 16b24ca93f82886251cba385f1c390c9ad6ba4d3 Merge: 6149183 1aec57c Author: Evan Morris Date: Fri May 3 12:34:32 2024 -0400 Merge pull request #221 from RobokopU24/molepro A PR with way too many things in it commit 1aec57c8d6eefa7cfae791d9e271ee8363643fc9 Author: Evan Morris Date: Thu May 2 14:20:38 2024 -0400 removing specific version from logging error which conflicted with the actual default commit 8168864c31f9468892c85945ddbf296dd5ce6632 Author: Evan Morris Date: Wed May 1 16:14:39 2024 -0400 fixed missing prefix bug commit df44bc62f71d6596bdba04453c4c5a0a12a18a20 Author: Evan Morris Date: Fri Apr 12 15:15:03 2024 -0400 bumping parsing version commit 5968f3e44680a0e268987fe4a30dea524df89edd Author: Evan Morris Date: Fri Apr 12 15:14:38 2024 -0400 fixing agent type typo commit 55b99b5ab7dba01626d8ca8df86062b8f7a571bf Author: Evan Morris Date: Fri Apr 12 00:44:36 2024 -0400 removing biolink prefix from edge and node properties, removing properties with no values from headers, improved comments commit 759c5134129d445966631c30223cb7aa75789636 Author: Evan Morris Date: Fri Apr 12 00:23:52 2024 -0400 adding some new env vars to the docker-compose commit 335b91c22da617671ac0d99697656f24cfe73981 Author: Evan Morris Date: Fri Apr 12 00:13:42 2024 -0400 changing args for neo4j_dump so that the output directory and sources can be in different places commit 26c4aa68f0ac1cb631524741fc6f4639f1c7b854 Author: Evan Morris Date: Thu Apr 11 11:25:45 2024 -0400 bumping vp parsing version commit 81d790f5206c078e0a9ed88be1839d735f57b1fe Author: Evan Morris Date: Thu Apr 11 11:22:28 2024 -0400 moving get_goa_edge_properties back outside of the class (to fix viral proteome) commit fd9033a0505204d8c85e09621d7b4a37a63516cc Author: Evan Morris Date: Thu Apr 11 10:08:14 2024 -0400 fixing leftover staticmethod declaration commit d3d57396fa5ab0d263995205387795330b4d5e3e Author: Evan Morris Date: Thu Apr 11 04:11:20 2024 -0400 added publication nodes and edges commit 4eca7beaf523ec821275f4807d0dacea7f8e1f5d Author: Evan Morris Date: Thu Apr 11 04:10:30 2024 -0400 added knowledge level and agent type, commented out extra gene groups file download commit 24cebabeffba8cafd88a2a2c1ad58301678d9348 Author: Evan Morris Date: Thu Apr 11 04:09:57 2024 -0400 fixed bug with backwards increases_transport_of edges, added knowledge level and agent type commit 555fb00cb5c5d45bd9f968c85022ea6c5f345734 Author: Evan Morris Date: Thu Apr 11 04:09:27 2024 -0400 fixed missing publications and qualified_predicate bug, added knowledge level and agent type commit b62a969efa7d274dce4f2b3b2907f9b8efdf7e6b Author: Evan Morris Date: Thu Apr 11 04:08:46 2024 -0400 adding knowledge level and agent type, using some more constants commit 
214b104d3128279bd16cbecde1124e4d01d87be3 Author: Evan Morris Date: Thu Apr 11 04:08:20 2024 -0400 adding knowledge level and agent type commit 00503f9e4f7e98b60f1b821027314222a259a706 Author: Evan Morris Date: Thu Apr 11 04:06:29 2024 -0400 added evidence code to knowledge level and agent type mapping commit 2359a69ba0ad1d5ce29089f18a28cd933ea4c932 Author: Evan Morris Date: Thu Apr 11 04:06:03 2024 -0400 adding knowledge level and agent type, switching to use some constants commit 08faa263f2c265b6e619648d606a03fa716c638b Author: Evan Morris Date: Thu Apr 11 04:05:34 2024 -0400 adding ability to fetch data from public DB, bumping source data version, adding knowledge level and agent type, adding drugbank knowledge source mapping and logging unmapped ks commit 3e1215452846b4a6a8e92b15e1140bf3761c734e Author: Evan Morris Date: Thu Apr 11 04:04:05 2024 -0400 adding knowledge level and agent type, general clean up commit b9b07a824f1ec098d12689e993b0c3f25424dc25 Author: Evan Morris Date: Thu Apr 11 04:03:04 2024 -0400 adding knowledge level and agent type commit 7785179dd4b6959534a31079f2c6924f9c0e5323 Author: Evan Morris Date: Thu Apr 11 04:00:49 2024 -0400 adding knowledge level and agent type, a bit of general clean up commit b679768bfbda389b5580518f7fe3697e53ae3a78 Author: Evan Morris Date: Thu Apr 11 03:59:19 2024 -0400 fixing extra quote and commenting out drugmechdb until we decide to put it on automat commit e16628d0dfc8793f8d269dbe8bc791311bed8230 Author: Evan Morris Date: Thu Apr 11 03:56:25 2024 -0400 adding predicate counts organized by knowledge source to metadata commit 8df1571747c2f7cad4ac42f9af725c8ae464f98e Merge: cb3c2d2 fc1786e Author: Evan Morris Date: Thu Apr 11 01:35:51 2024 -0400 Merge remote-tracking branch 'origin/DrugMechDBfulldataparser' into molepro commit fc1786eea84efd05fa4ac0c68d78558c3e189d00 Author: beasleyjonm <85600465+beasleyjonm@users.noreply.github.com> Date: Wed Apr 10 15:08:49 2024 -0400 Update loadDrugMechDB.py Changed "qualifier_predicate" to "qualified_predicate" commit a9be99f1db28a1505c03e6787c99394e2d159355 Author: beasleyjonm <85600465+beasleyjonm@users.noreply.github.com> Date: Wed Apr 10 15:08:04 2024 -0400 Update drugmechdb_predicate_map.json Changed "qualifier_predicate" to "qualified_predicate" commit cb3c2d208e57ec28755e425ebb0aa00bece61f11 Author: Evan Morris Date: Mon Apr 8 15:53:47 2024 -0400 moving biolink prefix mapping to bl_utils, removing some obsolete workarounds for biolink prefix / IRI mappings commit 2f295b3aea4cd00b3a67cfcd2670810e0cefd56f Author: Evan Morris Date: Mon Apr 8 11:08:54 2024 -0400 fixed data_services env var commit 63aac814b8b28c49afb010edfe754dc96376da08 Author: beasleyjonm Date: Mon Apr 8 00:56:02 2024 -0400 Updated parser to consolidate all relevant drugmechdb path ids into list on unique edges. commit c16aa7044ec10376f1eb5eaefeed8b78ca56f2e3 Author: beasleyjonm Date: Sun Apr 7 01:04:17 2024 -0400 Updated parser to parse every edge in the original db. Manually mapped un-normalized ids. 
commit 632db5ca14a874055b25e689103778c7421d29db Author: Evan Morris Date: Fri Apr 5 17:35:49 2024 -0400 adding knowledge level and agent type commit 51881ade495048891786ba2d6259ad54d2b53e24 Author: Evan Morris Date: Tue Mar 26 15:08:57 2024 -0400 updating some requirement versions and changing everything to specific pinned versions commit 2a00d29b7ed8fd29623c5d8450b48696d980a386 Author: Evan Morris Date: Tue Mar 26 15:08:27 2024 -0400 adding bl_version to helm charts and removing unnecessary neo4j password commit 9c0ea2b90aea7b32caee42b7a50d61cd0c6edb6e Author: Evan Morris Date: Tue Mar 26 15:06:54 2024 -0400 adding error checking and fallback for ssl issues commit f3b1d37bdd6a2cee195f1b0854bb4404ef3e1369 Author: Evan Morris Date: Tue Mar 26 14:19:09 2024 -0400 switching to http due to ssl errors with the https commit a83c7a8b7de40426b7e0a2f65c8e2e02057db2ec Author: Evan Morris Date: Tue Mar 26 11:50:32 2024 -0400 bumping parsing versions commit 59239948e77981a89d2711a7e4652658677b9e85 Author: Evan Morris Date: Tue Mar 26 11:49:30 2024 -0400 adding error checking for fields coming from llm that aren't strings commit 27302189a5465610188181ebd6257da91ccd198c Author: Evan Morris Date: Mon Mar 25 23:19:46 2024 -0400 updating latest image commit 88bf8fd9d40a7ce7d57917c90e079128e1f1e5dc Author: Evan Morris Date: Mon Mar 25 23:18:59 2024 -0400 removing extra entity extractor graph commit 0c245f92974752c55916f9913eb08c27b81a17e6 Author: Evan Morris Date: Mon Mar 25 23:18:08 2024 -0400 updated for new data, added abstract id to entity extractor and changed over to csv format output commit 3ee028ace65edd7678acb3563eaea3a6926e4b4c Author: Evan Morris Date: Mon Mar 25 22:34:49 2024 -0400 making a couple env vars optional, adding name res endpoint as an env var commit cde0bce149981434cbdd364512e080fd62c74947 Author: Evan Morris Date: Mon Mar 25 22:28:57 2024 -0400 fixing bug where fetch retry didn't try to fetch for the same version as it tried commit 996e8e453589deb351b60992dd126841b8d90261 Author: Evan Morris Date: Mon Mar 25 22:28:00 2024 -0400 adding litcoin from litcoin branches - merge was messy so new commits commit fcaa43902029f95e44c5153a7a8901f9d34c66ea Author: Evan Morris Date: Thu Mar 21 16:07:58 2024 -0400 updating version for monarch kg commit 27d338beda7275254503766284c6b53f7e603eda Author: Evan Morris Date: Thu Mar 21 16:06:25 2024 -0400 adding aggregator knowledge source for cam kp commit 54b8cc293370c7aee8be2209877b000fa5a612e9 Author: Evan Morris Date: Wed Mar 20 15:11:16 2024 -0400 forcing just one primary knowledge source instead of splitting commit 837c3398f6c77c8bfe1ce6d8fdeb88269ff497d7 Author: Evan Morris Date: Wed Mar 20 11:56:22 2024 -0400 updating with better defaults for sterling commit 0aa6a2eb7ed07d25611064911098d991c92bcf90 Author: Evan Morris Date: Wed Mar 20 11:55:25 2024 -0400 bumping requirements for bmt and jsonlines commit e2c462a72bd9a0a38045d877478d8709ad357179 Author: Evan Morris Date: Wed Mar 20 11:54:42 2024 -0400 making neo4j logs write to same scratch directory (hoping to solve permissions issues) commit 19423b22a67f8e1e34edbb38a75e515d18fc24a0 Author: Evan Morris Date: Wed Mar 20 11:53:33 2024 -0400 improving neo4j dump generation options and logging from cli entrypoint commit be5ba677fbc1f1e36b2de735c48235922f0bb8f7 Author: Evan Morris Date: Wed Mar 20 11:27:35 2024 -0400 cleaning up qualifier value ancestors and making sure the edge with no qualifiers gets written commit 859a9ebaf662c4d06f564e125d0e278104be9c43 Author: Evan Morris Date: Wed Mar 20 
06:10:38 2024 -0400 making BL_VERSION configurable with an env var and enforced with bmt commit 49e55d1fe3ee6177202bebd0f62e7b9bd26aeffa Author: Evan Morris Date: Wed Mar 20 06:02:09 2024 -0400 refactoring for performance and cleanup, now handles cases with direction but no aspect, using constants for qualifiers, make tqdm optional commit 5574f28e6398ada65b1a8ef24870c747f591d57c Author: Evan Morris Date: Wed Mar 20 05:58:09 2024 -0400 removing extra empty properties, handling multiple primary knowledge sources commit cf541c247c16e0122c2779de2d3427056c1abc5f Author: Evan Morris Date: Wed Mar 20 05:55:13 2024 -0400 making it so that invalid infores ids don't crash normalization again commit 74f0b02db406c4f11cfc7c76d9e642ac1b6f238c Author: Evan Morris Date: Wed Mar 20 03:27:08 2024 -0400 Squashed commit of the following: commit 28fe0f5238d300c5f7fbf6c2cf10c33a26cf5b3f Author: james0032 Date: Fri Mar 8 21:44:56 2024 +0000 predicate format corrected commit b0ba22d86a3de7f20497ba4ca06031b1c9a830f4 Merge: 9bf375c 421a9a1 Author: James Chung Date: Wed Nov 22 10:55:22 2023 -0500 merge for snakify curie commit 9bf375c2caeb21c257928b03124553b27c00d790 Author: James Chung Date: Wed Nov 22 10:43:25 2023 -0500 QUALIFIED_PREDICATE commit 421a9a195daee6ff121efde6dfb5cfc2a787675c Author: Evan Morris Date: Fri Nov 17 14:15:27 2023 -0500 converting predicates from bmt get_ancestors to snakified biolink curies commit e92b623a4b0d7a509fd53e75c09b7d301f89ad46 Author: James Chung Date: Fri Aug 25 14:56:27 2023 -0400 First version of redundant graph generator commit 1ae25c96b081e518b9b45313f89b45112995a937 Author: Evan Morris Date: Wed Mar 20 03:14:32 2024 -0400 first pass at molepro parser, really the first implementation of a generic parser commit 36b84633c2e851b648cf1096b2db4819929663bf Author: Evan Morris Date: Wed Mar 20 03:13:05 2024 -0400 adding the other form of synonym property commit 373af9c763d8d9854555f7b613f34420c0fb81d0 Author: Evan Morris Date: Wed Mar 20 03:12:38 2024 -0400 adding a way to write just one normalized node commit 6a16c0ee463e8542204f9a7924ae46cfb6ad5b98 Author: Evan Morris Date: Wed Mar 20 03:12:07 2024 -0400 adding more groupings commit a0dffb34a8b032123f6fb21736556825884c6371 Author: Evan Morris Date: Wed Mar 20 01:41:59 2024 -0400 adding more node constants and rearranging some, removing biolink from knowledge source attributes commit 53b33751177973c9c97c74869ed8baa230b0ab49 Author: Evan Morris Date: Wed Mar 20 01:14:02 2024 -0400 cleaning up unnecessary defaults for nodes, this is functionally equivalent commit fda2bf2391e08bbde663e3cfd9ec3661bea5429d Author: Evan Morris Date: Wed Mar 20 01:13:22 2024 -0400 updating binding to use appropriate edge property constants commit 1f0e6f4be430b0743ea5be86bd290f3df830643d Author: Evan Morris Date: Wed Mar 20 01:12:25 2024 -0400 removing obsolete imports commit 5d0e1501aad9be35f91610bce08982820e4d061a Author: Evan Morris Date: Wed Mar 20 01:11:50 2024 -0400 reorganized constants: moved node_types.py into biolink_constants.py, added lots of edge properties, changed from using ROOT_ENTITY to just NAMED_THING --- .github/workflows/release.yml | 20 +- .gitignore | 3 + Common/biolink_constants.py | 192 ++++ Common/biolink_utils.py | 28 +- Common/build_manager.py | 42 +- Common/data_sources.py | 10 + Common/extractor.py | 2 +- Common/kgx_file_converter.py | 52 +- Common/kgx_file_merger.py | 8 +- Common/kgx_file_normalizer.py | 40 +- Common/kgx_file_writer.py | 5 +- Common/kgxmodel.py | 34 +- Common/load_manager.py | 18 +- Common/merging.py | 47 
+- Common/meta_kg.py | 2 +- Common/metadata.py | 5 +- Common/neo4j_meta_kg.py | 6 +- Common/neo4j_tools.py | 64 +- Common/node_types.py | 80 -- Common/normalization.py | 249 +++-- Common/predicates.py | 38 +- Common/redundant_kg.py | 106 +++ Common/supplementation.py | 14 +- Common/utils.py | 29 +- Dockerfile | 2 +- README.md | 24 +- cli/generate_redundant_kg.py | 14 + cli/neo4j_dump.py | 27 +- docker-compose.yml | 5 +- graph_specs/ctkp-graph-spec.yaml | 9 + graph_specs/default-graph-spec.yml | 16 +- graph_specs/dug-graph-spec.yaml | 24 + graph_specs/litcoin-graph-spec.yml | 19 + graph_specs/rule-mining-graph-spec.yaml | 30 + graph_specs/yeast-graph-spec.yml | 2 +- helm/orion/renci-slim.yaml | 15 + helm/orion/renci-values.yaml | 23 +- helm/orion/templates/graph-builder.yaml | 16 +- parsers/BINDING/src/loadBINDINGDB.py | 140 +-- parsers/CTD/src/loadCTD.py | 112 +-- parsers/FooDB/src/loadFDB.py | 1 + parsers/GOA/src/loadGOA.py | 90 +- parsers/GTEx/src/loadGTEx.py | 14 +- parsers/GWASCatalog/src/loadGWASCatalog.py | 11 +- .../GenomeAlliance/src/loadGenomeAlliance.py | 2 +- parsers/IntAct/src/loadIA.py | 28 +- parsers/KinAce/src/loadKinAce.py | 1 + parsers/LitCoin/src/loadLitCoin.py | 370 ++++++++ parsers/PHAROS/src/loadPHAROS.py | 46 +- parsers/Reactome/src/loadReactome.py | 49 +- parsers/SGD/src/loadSGD.py | 4 +- parsers/SGD/src/sgd_source_retriever.py | 11 +- parsers/STRING/src/loadSTRINGDB.py | 34 +- parsers/UberGraph/src/loadUG.py | 7 +- parsers/UberGraph/src/ubergraph.py | 39 +- parsers/ViralProteome/src/loadUniRef.py | 1 + parsers/ViralProteome/src/loadVP.py | 2 +- parsers/_parser_template/src/parser.py | 2 +- parsers/camkp/src/loadCAMKP.py | 33 +- parsers/clinicaltrials/src/loadCTKP.py | 222 +++++ parsers/cord19/src/loadCord19.py | 2 +- parsers/drugcentral/src/loaddrugcentral.py | 124 ++- .../drugmechdb/src/drugmechdb_node_map.json | 858 ++++++++++++++++++ .../src/drugmechdb_predicate_map.json | 192 ++++ parsers/drugmechdb/src/loadDrugMechDB.py | 153 +++- parsers/gtopdb/src/loadGtoPdb.py | 32 +- parsers/hetio/src/loadHetio.py | 9 +- parsers/hgnc/src/loadHGNC.py | 32 +- parsers/hmdb/src/loadHMDB.py | 28 +- parsers/molepro/src/loadMolePro.py | 146 +++ parsers/monarchkg/src/loadMonarchKG.py | 37 +- parsers/panther/src/loadPanther.py | 34 +- parsers/scent/src/loadScent.py | 2 +- parsers/textminingkp/src/loadTMKP.py | 6 +- parsers/yeast/src/loadCostanza2016.py | 8 +- parsers/yeast/src/loadHistoneMap.py | 10 +- .../yeast/src/loadYeastGeneExpressionGasch.py | 2 +- .../yeast/src/loadYeastNucleosomesGSE61888.py | 2 +- requirements.txt | 32 +- set_up_test_env.sh | 15 +- tests/test_file_writer.py | 2 +- tests/test_merging.py | 55 +- tests/test_normalization.py | 20 +- 83 files changed, 3456 insertions(+), 884 deletions(-) create mode 100644 .gitignore create mode 100644 Common/biolink_constants.py delete mode 100644 Common/node_types.py create mode 100644 Common/redundant_kg.py create mode 100644 cli/generate_redundant_kg.py create mode 100644 graph_specs/ctkp-graph-spec.yaml create mode 100644 graph_specs/dug-graph-spec.yaml create mode 100644 graph_specs/litcoin-graph-spec.yml create mode 100644 graph_specs/rule-mining-graph-spec.yaml create mode 100644 helm/orion/renci-slim.yaml create mode 100644 parsers/LitCoin/src/loadLitCoin.py create mode 100644 parsers/clinicaltrials/src/loadCTKP.py create mode 100644 parsers/drugmechdb/src/drugmechdb_node_map.json create mode 100644 parsers/drugmechdb/src/drugmechdb_predicate_map.json create mode 100644 parsers/molepro/src/loadMolePro.py diff --git 
a/.github/workflows/release.yml b/.github/workflows/release.yml index a78d4833..eb959fe5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -13,27 +13,27 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out the repo - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Get the version id: get_version run: echo ::set-output name=VERSION::${GITHUB_REF/refs\/tags\//} - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 - with: - images: - ghcr.io/${{ github.repository }} - name: Login to ghcr - uses: docker/login-action@v1 + uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: + ghcr.io/${{ github.repository }} - name: Push to GitHub Packages - uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc + uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671 with: context: . push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - build-args: VERSION=${{ steps.get_version.outputs.VERSION }} \ No newline at end of file + build-args: VERSION=${{ steps.get_version.outputs.VERSION }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..9783daa9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# pycache +**/__pycache__ +*.pycache diff --git a/Common/biolink_constants.py b/Common/biolink_constants.py new file mode 100644 index 00000000..15bd0a32 --- /dev/null +++ b/Common/biolink_constants.py @@ -0,0 +1,192 @@ +# A collection of constants for biolink variable names and types +# TODO it would be nice to verify these with bmt +NAMED_THING = 'biolink:NamedThing' +BIOLOGICAL_ENTITY = 'biolink:BiologicalEntity' +DISEASE_OR_PHENOTYPIC_FEATURE = 'biolink:DiseaseOrPhenotypicFeature' +DISEASE = 'biolink:Disease' +PHENOTYPIC_FEATURE = 'biolink:PhenotypicFeature' +MOLECULAR_ENTITY = 'biolink:MolecularEntity' +CHEMICAL_SUBSTANCE = 'biolink:ChemicalSubstance' +DRUG = 'biolink:Drug' +METABOLITE = 'biolink:Metabolite' +ANATOMICAL_ENTITY = 'biolink:AnatomicalEntity' +GENE = 'biolink:Gene' +GENE_PRODUCT = 'biolink:GeneProduct' +GENE_OR_GENE_PRODUCT = 'biolink:GeneOrGeneProduct' +SEQUENCE_VARIANT = 'biolink:SequenceVariant' +BIOLOGICAL_PROCESS_OR_ACTIVITY = 'biolink:BiologicalProcessOrActivity' +MOLECULAR_ACTIVITY = 'biolink:MolecularActivity' +BIOLOGICAL_PROCESS = 'biolink:BiologicalProcess' +PATHWAY = 'biolink:Pathway' +CELLULAR_COMPONENT = 'biolink:CellularComponent' +CELL = 'biolink:Cell' +GROSS_ANATOMICAL_STRUCTURE = 'biolink:GrossAnatomicalStructure' +GENETIC_CONDITION = 'biolink:GeneticCondition' +UNSPECIFIED = 'biolink:Unspecified' +GENE_FAMILY = 'biolink:GeneFamily' +GENOMIC_ENTITY = 'biolink:GenomicEntity' +FOOD = 'biolink:Food' +MACROMOLECULAR_COMPLEX = 'biolink:MacromolecularComplex' + +# properties on nodes +NODE_ID = 'id' +NODE_TYPES = 'category' +NAME = 'name' +SYNONYM = 'synonym' +TRADE_NAME = 'trade_name' +CHEMICAL_ROLE = 'chemical_role' +HAS_CHEMICAL_FORMULA = 'has_chemical_formula' +IN_TAXON = 'in_taxon' +ROUTES_OF_DELIVERY = 'routes_of_delivery' +SYMBOL = 'symbol' + + +# properties on edges +EDGE_ID = 'id' +SUBJECT_ID = 'subject' +OBJECT_ID = 'object' +PREDICATE = 'predicate' +PRIMARY_KNOWLEDGE_SOURCE = 
'primary_knowledge_source' +AGGREGATOR_KNOWLEDGE_SOURCES = 'aggregator_knowledge_source' +SUPPORTING_DATA_SOURCE = 'supporting_data_source' +P_VALUE = 'p_value' +ADJUSTED_P_VALUE = 'adjusted_p_value' +AGENT_TYPE = 'agent_type' +KNOWLEDGE_LEVEL = 'knowledge_level' +MAX_RESEARCH_PHASE = 'max_research_phase' +HAS_SUPPORTING_STUDY_RESULT = 'has_supporting_study_result' + +# enums for knowledge level +KNOWLEDGE_ASSERTION = 'knowledge_assertion' +LOGICAL_ENTAILMENT = 'logical_entailment' +PREDICATION = 'prediction' +STATISTICAL_ASSOCIATION = 'statistical_association' +OBSERVATION = 'observation' +NOT_PROVIDED = 'not_provided' + +# enums for agent type +MANUAL_AGENT = 'manual_agent' +AUTOMATED_AGENT = 'automated_agent' +DATA_PIPELINE = 'data_analysis_pipeline' +COMPUTATIONAL_MODEL = 'computational_model' +TEXT_MINING_AGENT = 'text_mining_agent' +IMAGE_PROCESSING_AGENT = 'image_processing_agent' +MANUAL_VALIDATION_OF_AUTOMATED_AGENT = 'manual_validation_of_automated_agent' + +# properties that could be on edges or nodes (I think?) +DESCRIPTION = 'description' +PUBLICATIONS = 'publications' +XREFS = 'xref' + +FDA_APPROVAL_STATUS = 'highest_FDA_approval_status' +MECHANISM_OF_ACTION = 'mechanism_of_action' + +# these aren't in biolink, but we use them on edges +AFFINITY = 'affinity' +AFFINITY_PARAMETER = 'affinity_parameter' +INFORMATION_CONTENT = 'information_content' + + +# edge qualifier properties +ANATOMICAL_CONTEXT_QUALIFIER = 'anatomical_context_qualifier' +CAUSAL_MECHANISM_QUALIFIER = 'causal_mechanism_qualifier' +CONTEXT_QUALIFIER = 'context_qualifier' +DERIVATIVE_QUALIFIER = 'derivative_qualifier' +OBJECT_ASPECT_QUALIFIER = 'object_aspect_qualifier' +OBJECT_DERIVATIVE_QUALIFIER = 'object_derivative_qualifier' +OBJECT_DIRECTION_QUALIFIER = 'object_direction_qualifier' +OBJECT_FORM_OR_VARIANT_QUALIFIER = 'object_form_or_variant_qualifier' +OBJECT_PART_QUALIFIER = 'object_part_qualifier' +QUALIFIED_PREDICATE = 'qualified_predicate' +SPECIES_CONTEXT_QUALIFIER = 'species_context_qualifier' +SUBJECT_ASPECT_QUALIFIER = 'subject_aspect_qualifier' +SUBJECT_DERIVATIVE_QUALIFIER = 'subject_derivative_qualifier' +SUBJECT_DIRECTION_QUALIFIER = 'subject_direction_qualifier' +SUBJECT_FORM_OR_VARIANT_QUALIFIER = 'subject_form_or_variant_qualifier' +SUBJECT_PART_QUALIFIER = 'subject_part_qualifier' + + +# this should probably be changed to the valid biolink synonym property but don't want to break downstream tools yet +SYNONYMS = 'equivalent_identifiers' + +BIOLINK_NODE_PROPERTIES = [ + NODE_ID, + NODE_TYPES, + NAME, + DESCRIPTION, + PUBLICATIONS, + XREFS, + SYNONYM, + TRADE_NAME, + CHEMICAL_ROLE, + HAS_CHEMICAL_FORMULA, + FDA_APPROVAL_STATUS, + MECHANISM_OF_ACTION, + IN_TAXON, + ROUTES_OF_DELIVERY, + SYMBOL +] + +REQUIRED_NODE_PROPERTIES = [ + NODE_ID, + NODE_TYPES, + NAME +] + +BIOLINK_EDGE_PROPERTIES = [ + EDGE_ID, + SUBJECT_ID, + OBJECT_ID, + PREDICATE, + PRIMARY_KNOWLEDGE_SOURCE, + AGGREGATOR_KNOWLEDGE_SOURCES, + SUPPORTING_DATA_SOURCE, + PUBLICATIONS, + SYNONYMS, + DESCRIPTION, + XREFS, + P_VALUE, + ADJUSTED_P_VALUE, + AGENT_TYPE, + FDA_APPROVAL_STATUS, + KNOWLEDGE_LEVEL, + MECHANISM_OF_ACTION, + MAX_RESEARCH_PHASE, + HAS_SUPPORTING_STUDY_RESULT, + # qualifiers + ANATOMICAL_CONTEXT_QUALIFIER, + CAUSAL_MECHANISM_QUALIFIER, + CONTEXT_QUALIFIER, + DERIVATIVE_QUALIFIER, + OBJECT_ASPECT_QUALIFIER, + OBJECT_DERIVATIVE_QUALIFIER, + OBJECT_DIRECTION_QUALIFIER, + OBJECT_FORM_OR_VARIANT_QUALIFIER, + OBJECT_PART_QUALIFIER, + QUALIFIED_PREDICATE, + SPECIES_CONTEXT_QUALIFIER, + SUBJECT_ASPECT_QUALIFIER, + 
SUBJECT_DERIVATIVE_QUALIFIER, + SUBJECT_DIRECTION_QUALIFIER, + SUBJECT_FORM_OR_VARIANT_QUALIFIER, + SUBJECT_PART_QUALIFIER, +] + +REQUIRED_EDGE_PROPERTIES = [ + SUBJECT_ID, + OBJECT_ID, + PREDICATE, + PRIMARY_KNOWLEDGE_SOURCE +] + +BIOLINK_PROPERTIES_THAT_ARE_LISTS = [ + SYNONYMS, + SYNONYM, + NODE_TYPES, + AGGREGATOR_KNOWLEDGE_SOURCES, + PUBLICATIONS, + XREFS +] + +# biolink compliant predicates +SUBCLASS_OF = 'biolink:subclass_of' \ No newline at end of file diff --git a/Common/biolink_utils.py b/Common/biolink_utils.py index d32a6520..fc75936f 100644 --- a/Common/biolink_utils.py +++ b/Common/biolink_utils.py @@ -1,8 +1,19 @@ import requests import yaml +import os from bmt import Toolkit + +BIOLINK_MODEL_VERSION = os.environ.get("BL_VERSION", "4.1.6") +BIOLINK_MODEL_SCHEMA_URL = f"https://raw.githubusercontent.com/biolink/biolink-model/v{BIOLINK_MODEL_VERSION}/biolink-model.yaml" +PREDICATE_MAP_URL = f"https://raw.githubusercontent.com/biolink/biolink-model/v{BIOLINK_MODEL_VERSION}/predicate_mapping.yaml" + + +def get_biolink_model_toolkit(): + return Toolkit(schema=BIOLINK_MODEL_SCHEMA_URL, predicate_map=PREDICATE_MAP_URL) + + map_data = { "attribute_type_map": { "`biolink:primary_knowledge_source`": "biolink:primary_knowledge_source", @@ -26,7 +37,7 @@ class BiolinkUtils: def __init__(self): - self.toolkit = Toolkit() + self.toolkit = get_biolink_model_toolkit() def find_biolink_leaves(self, biolink_concepts: set): """ @@ -136,6 +147,21 @@ def predicate_has_qualifiers(self, predicate): return False +BIOLINK_MAPPING_CHANGES = { + 'KEGG': 'http://identifiers.org/kegg/', + 'NCBIGene': 'https://identifiers.org/ncbigene/' +} + + +def get_biolink_prefix_map(): + response = requests.get(f'https://raw.githubusercontent.com/biolink/biolink-model/v{BIOLINK_MODEL_VERSION}/project/prefixmap/biolink_model_prefix_map.json') + if response.status_code != 200: + response.raise_for_status() + biolink_prefix_map = response.json() + biolink_prefix_map.update(BIOLINK_MAPPING_CHANGES) + return biolink_prefix_map + + INFORES_STATUS_INVALID = 'invalid' INFORES_STATUS_DEPRECATED = 'deprecated' INFORES_STATUS_VALID = 'valid' diff --git a/Common/build_manager.py b/Common/build_manager.py index 20867102..0f378927 100644 --- a/Common/build_manager.py +++ b/Common/build_manager.py @@ -12,15 +12,17 @@ from Common.load_manager import SourceDataManager from Common.kgx_file_merger import KGXFileMerger from Common.neo4j_tools import create_neo4j_dump -from Common.kgxmodel import GraphSpec, SubGraphSource, DataSource, NormalizationScheme -from Common.normalization import NORMALIZATION_CODE_VERSION +from Common.kgxmodel import GraphSpec, SubGraphSource, DataSource +from Common.normalization import NORMALIZATION_CODE_VERSION, NormalizationScheme from Common.metadata import Metadata, GraphMetadata, SourceMetadata from Common.supplementation import SequenceVariantSupplementation -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, PREDICATE, PUBLICATIONS +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, PREDICATE, PUBLICATIONS from Common.meta_kg import MetaKnowledgeGraphBuilder, META_KG_FILENAME, TEST_DATA_FILENAME +from Common.redundant_kg import generate_redundant_kg NODES_FILENAME = 'nodes.jsonl' EDGES_FILENAME = 'edges.jsonl' +REDUNDANT_EDGES_FILENAME = 'redundant_edges.jsonl' class GraphBuilder: @@ -99,8 +101,8 @@ def build_graph(self, graph_id: str): if qc_results['pass']: self.logger.info(f'QC passed for graph {graph_id}.') else: - # TODO 
- bail if qc fails - just need to implement a way to force output regardless - self.logger.info(f'QC failed for graph {graph_id}') + self.logger.info(f'QC failed for graph {graph_id}, bailing..') + return needs_meta_kg = not self.has_meta_kg(graph_directory=graph_output_dir) needs_test_data = not self.has_test_data(graph_directory=graph_output_dir) @@ -110,27 +112,34 @@ def build_graph(self, graph_id: str): generate_meta_kg=needs_meta_kg, generate_test_data=needs_test_data) - if 'neo4j' in graph_spec.graph_output_format.lower(): + output_formats = graph_spec.graph_output_format.lower().split('+') if graph_spec.graph_output_format else [] + nodes_filepath = os.path.join(graph_output_dir, NODES_FILENAME) + edges_filepath = os.path.join(graph_output_dir, EDGES_FILENAME) + if 'neo4j' in output_formats: self.logger.info(f'Starting Neo4j dump pipeline for {graph_id}...') - dump_success = create_neo4j_dump(graph_id=graph_id, + dump_success = create_neo4j_dump(nodes_filepath=nodes_filepath, + edges_filepath=edges_filepath, + output_directory=graph_output_dir, + graph_id=graph_id, graph_version=graph_version, - graph_directory=graph_output_dir, - nodes_filename=NODES_FILENAME, - edges_filename=EDGES_FILENAME, logger=self.logger) if dump_success: graph_output_url = self.get_graph_output_URL(graph_id, graph_version) graph_metadata.set_dump_url(f'{graph_output_url}graph_{graph_version}.db.dump') + if 'redundant_jsonl' in output_formats: + self.logger.info(f'Generating redundant edge KG for {graph_id}...') + redundant_filepath = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME) + generate_redundant_kg(edges_filepath, redundant_filepath) + def build_dependencies(self, graph_spec: GraphSpec): for subgraph_source in graph_spec.subgraphs: subgraph_id = subgraph_source.id subgraph_version = subgraph_source.version if self.check_for_existing_graph_dir(subgraph_id, subgraph_version): # load previous metadata - graph_metadata = self.get_graph_metadata(subgraph_id, subgraph_version) - subgraph_source.graph_metadata = graph_metadata.metadata + subgraph_source.graph_metadata = self.get_graph_metadata(subgraph_id, subgraph_version) elif self.current_graph_versions[subgraph_id] == subgraph_version: self.logger.warning(f'For graph {graph_spec.graph_id} subgraph dependency ' f'{subgraph_id} version {subgraph_version} is not ready. 
Building now...') @@ -229,6 +238,7 @@ def run_qc(self, aggregator_knowledge_sources = set() edge_properties = set() predicate_counts = defaultdict(int) + predicate_counts_by_ks = defaultdict(lambda: defaultdict(int)) edges_with_publications = defaultdict(int) graph_edges_file_path = os.path.join(graph_directory, EDGES_FILENAME) for edge_json in quick_jsonl_file_iterator(graph_edges_file_path): @@ -239,6 +249,7 @@ def run_qc(self, for key in edge_json.keys(): edge_properties.add(key) predicate_counts[edge_json[PREDICATE]] += 1 + predicate_counts_by_ks[edge_json[PRIMARY_KNOWLEDGE_SOURCE]][edge_json[PREDICATE]] += 1 if PUBLICATIONS in edge_json and edge_json[PUBLICATIONS]: edges_with_publications[edge_json[PREDICATE]] += 1 @@ -268,7 +279,9 @@ def run_qc(self, 'pass': True, 'primary_knowledge_sources': list(primary_knowledge_sources), 'aggregator_knowledge_sources': list(aggregator_knowledge_sources), - 'predicates': {k: v for k, v in predicate_counts.items()}, + 'predicate_totals': {k: v for k, v in predicate_counts.items()}, + 'predicates_by_knowledge_source': {ks: {predicate: count for predicate, count in ks_to_p.items()} + for ks, ks_to_p in predicate_counts_by_ks.items()}, 'node_curie_prefixes': {k: v for k, v in node_curie_prefixes.items()}, 'edges_with_publications': {k: v for k, v in edges_with_publications.items()}, 'edge_properties': list(edge_properties), @@ -277,7 +290,6 @@ def run_qc(self, if deprecated_infores_ids: qc_metadata['warnings']['deprecated_knowledge_sources'] = deprecated_infores_ids if invalid_infores_ids: - qc_metadata['pass'] = False qc_metadata['warnings']['invalid_knowledge_sources'] = invalid_infores_ids return qc_metadata @@ -444,7 +456,7 @@ def get_graph_dir_path(self, graph_id: str, graph_version: str): return os.path.join(self.graphs_dir, graph_id, graph_version) def get_graph_output_URL(self, graph_id: str, graph_version: str): - graph_output_url = os.environ['ORION_OUTPUT_URL'] + graph_output_url = os.environ.get('ORION_OUTPUT_URL', "https://localhost/") if graph_output_url[-1] != '/': graph_output_url += '/' return f'{graph_output_url}{graph_id}/{graph_version}/' diff --git a/Common/data_sources.py b/Common/data_sources.py index 4cfac24a..082cf923 100644 --- a/Common/data_sources.py +++ b/Common/data_sources.py @@ -4,6 +4,7 @@ BINDING_DB = 'BINDING-DB' CAM_KP = 'CAM-KP' CHEBI_PROPERTIES = 'CHEBIProps' +CLINICAL_TRIALS_KP = 'ClinicalTrialsKP' CORD19 = 'Cord19' CTD = 'CTD' DRUG_CENTRAL = 'DrugCentral' @@ -18,7 +19,11 @@ HMDB = 'HMDB' HUMAN_GOA = 'HumanGOA' INTACT = 'IntAct' +LITCOIN = 'LitCoin' +LITCOIN_SAPBERT = 'LitCoinSapBERT' +LITCOIN_ENTITY_EXTRACTOR = 'LitCoinEntityExtractor' KINACE = 'KinAce' +MOLEPRO = 'MolePro' MONARCH_KG = 'MonarchKG' MONDO_PROPS = 'MONDOProps' ONTOLOGICAL_HIERARCHY = 'OntologicalHierarchy' @@ -47,6 +52,7 @@ BINDING_DB: ("parsers.BINDING.src.loadBINDINGDB", "BINDINGDBLoader"), CAM_KP: ("parsers.camkp.src.loadCAMKP", "CAMKPLoader"), CHEBI_PROPERTIES: ("parsers.chebi.src.loadChebiProperties", "ChebiPropertiesLoader"), + CLINICAL_TRIALS_KP: ("parsers.clinicaltrials.src.loadCTKP", "CTKPLoader"), CORD19: ("parsers.cord19.src.loadCord19", "Cord19Loader"), CTD: ("parsers.CTD.src.loadCTD", "CTDLoader"), DRUG_CENTRAL: ("parsers.drugcentral.src.loaddrugcentral", "DrugCentralLoader"), @@ -61,7 +67,11 @@ HUMAN_GOA: ("parsers.GOA.src.loadGOA", "HumanGOALoader"), HUMAN_STRING: ("parsers.STRING.src.loadSTRINGDB", "HumanSTRINGDBLoader"), INTACT: ("parsers.IntAct.src.loadIA", "IALoader"), + LITCOIN: ("parsers.LitCoin.src.loadLitCoin", 
"LitCoinLoader"), + LITCOIN_ENTITY_EXTRACTOR: ("parsers.LitCoin.src.loadLitCoin", "LitCoinEntityExtractorLoader"), + LITCOIN_SAPBERT: ("parsers.LitCoin.src.loadLitCoin", "LitCoinSapBERTLoader"), KINACE: ("parsers.KinAce.src.loadKinAce", "KinAceLoader"), + MOLEPRO: ("parsers.molepro.src.loadMolePro", "MoleProLoader"), MONARCH_KG: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGLoader"), MONDO_PROPS: ("parsers.MONDOProperties.src.loadMP", "MPLoader"), ONTOLOGICAL_HIERARCHY: ("parsers.UberGraph.src.loadUG", "OHLoader"), diff --git a/Common/extractor.py b/Common/extractor.py index 2054f34e..a8084550 100644 --- a/Common/extractor.py +++ b/Common/extractor.py @@ -1,7 +1,7 @@ import csv from Common.kgxmodel import kgxnode, kgxedge from Common.kgx_file_writer import KGXFileWriter -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES class Extractor: diff --git a/Common/kgx_file_converter.py b/Common/kgx_file_converter.py index df180a91..e2f8d4bf 100644 --- a/Common/kgx_file_converter.py +++ b/Common/kgx_file_converter.py @@ -2,7 +2,7 @@ import argparse from collections import defaultdict from Common.utils import quick_jsonl_file_iterator -from Common.node_types import SUBJECT_ID, OBJECT_ID, PREDICATE +from Common.biolink_constants import SUBJECT_ID, OBJECT_ID, PREDICATE def convert_jsonl_to_neo4j_csv(nodes_input_file: str, @@ -17,6 +17,8 @@ def convert_jsonl_to_neo4j_csv(nodes_input_file: str, if not edges_output_file: edges_output_file = f'{edges_input_file.rsplit(".")[0]}.csv' + # these will get converted into headers + # required properties have unique/specialized types of the following instead of normal variable types required_node_properties = { 'id': 'ID', 'name': 'string', @@ -30,6 +32,8 @@ def convert_jsonl_to_neo4j_csv(nodes_input_file: str, array_delimiter=array_delimiter) # __verify_conversion(nodes_output_file, node_properties, array_delimiter, output_delimiter) + # these will get converted into headers + # required properties have unique/specialized types of the following instead of normal variable types required_edge_properties = { SUBJECT_ID: 'START_ID', PREDICATE: 'TYPE', @@ -43,7 +47,7 @@ def convert_jsonl_to_neo4j_csv(nodes_input_file: str, array_delimiter=array_delimiter) # __verify_conversion(edges_output_file, edge_properties, array_delimiter, output_delimiter) - +""" def __verify_conversion(file_path: str, properties: dict, array_delimiter: str, @@ -82,7 +86,7 @@ def __verify_conversion(file_path: str, print(f'Not all properties were verified.. 
This should not happen..') print(f'Properties that were not verified: ' f'{[prop for prop in properties.keys() if prop not in verified_properties]}') - +""" def __determine_properties_and_types(file_path: str, required_properties: dict): property_type_counts = defaultdict(lambda: defaultdict(int)) @@ -90,8 +94,10 @@ def __determine_properties_and_types(file_path: str, required_properties: dict): for key, value in entity.items(): if value is None: property_type_counts[key]["None"] += 1 - if key in required_properties: - print(f'WARNING: Required property None: {entity.items()}') + if key in required_properties and key != "name": + print(f'WARNING: Required property ({key}) was None: {entity.items()}') + raise Exception( + f'None found as a value for a required property (property: {key}) in line {entity.items()}') elif isinstance(value, bool): property_type_counts[key]["boolean"] += 1 elif isinstance(value, int): @@ -118,26 +124,34 @@ def __determine_properties_and_types(file_path: str, required_properties: dict): else: property_type_counts[key]["string"] += 1 + # start with the required_properties dictionary, it has the hard coded unique types for them already properties = required_properties.copy() + properties_to_remove = [] for prop, type_counts in property_type_counts.items(): prop_types = list(type_counts.keys()) num_prop_types = len(prop_types) - if 'None' in prop_types: - print(f'WARNING: None found as a value for property {prop}, that should not happen!') - if prop in required_properties: - raise Exception(f'None found as a value for a required property - {type_counts.items()}') + # if 'None' in prop_types: + # print(f'WARNING: None found as a value for property {prop}') - if prop in required_properties and (num_prop_types > 1): + if prop in required_properties and (num_prop_types > 1) and prop != "name": + # TODO this should just enforce that required properties are the correct type, + # instead of trying to establish the type raise Exception(f'Required property {prop} had multiple conflicting types: {type_counts.items()}') elif prop in required_properties: # do nothing, already set pass elif num_prop_types == 1: - # if only one type just set it to that - properties[prop] = prop_types[0] + # if the only prop type is None that means it had no values + if prop_types[0] == "None": + # set to remove from the properties list which means it won't be in the output files + properties_to_remove.append(prop) + else: + # otherwise if only one type just set it to that + properties[prop] = prop_types[0] else: - # try to resolve the conflicts - TODO: this probably needs more work, it means a property had mixed types + # TODO: this probably needs more work + # try to resolve conflicting types, attempt to pick the type that will accommodate all of the values # print(f'Property {prop} had conflicting types: {type_counts}') if 'string[]' in prop_types: properties[prop] = 'string[]' @@ -154,7 +168,7 @@ def __determine_properties_and_types(file_path: str, required_properties: dict): else: properties[prop] = 'string' - if prop not in properties: + if prop not in properties and prop not in properties_to_remove: raise Exception(f'Property type could not be determined for: {prop}. 
{type_counts.items()}') # print(f'Found {len(properties)} properties:{properties.items()}') @@ -166,19 +180,22 @@ def __convert_to_csv(input_file: str, properties: dict, # dictionary of { node/edge property: property_type } array_delimiter: str, output_delimiter: str): - - headers = {prop: f'{prop}:{prop_type}' for prop, prop_type in properties.items()} + headers = {prop: f'{prop.removeprefix("biolink:")}:{prop_type}' for prop, prop_type in properties.items()} with open(output_file, 'w', newline='') as output_file_handler: csv_file_writer = csv.DictWriter(output_file_handler, delimiter=output_delimiter, fieldnames=properties, restval='', + extrasaction='ignore', quoting=csv.QUOTE_MINIMAL) csv_file_writer.writerow(headers) for item in quick_jsonl_file_iterator(input_file): for key in list(item.keys()): if item[key] is None: - del item[key] + if key == "name": + item["name"] = item["id"] + else: + del item[key] else: prop_type = properties[key] # convert lists into strings with an array delimiter @@ -195,7 +212,6 @@ def __convert_to_csv(input_file: str, if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Convert jsonl kgx files to csv neo4j import files') parser.add_argument('nodes', help='file with nodes in jsonl format') parser.add_argument('edges', help='file with edges in jsonl format') diff --git a/Common/kgx_file_merger.py b/Common/kgx_file_merger.py index 871281bf..b6d54159 100644 --- a/Common/kgx_file_merger.py +++ b/Common/kgx_file_merger.py @@ -3,7 +3,7 @@ from itertools import chain from Common.utils import LoggingUtil, quick_jsonl_file_iterator, quick_json_dumps, quick_json_loads from Common.kgxmodel import GraphSpec, SubGraphSource, DataSource -from Common.node_types import SUBJECT_ID, OBJECT_ID +from Common.biolink_constants import SUBJECT_ID, OBJECT_ID from Common.merging import GraphMerger, DiskGraphMerger, MemoryGraphMerger from Common.load_manager import RESOURCE_HOGS @@ -85,8 +85,10 @@ def merge_primary_sources(self, needs_on_disk_merge = False for graph_source in graph_sources: if isinstance(graph_source, SubGraphSource): - needs_on_disk_merge = True - break + for source_id in graph_source.graph_metadata.get_source_ids(): + if source_id in RESOURCE_HOGS: + needs_on_disk_merge = True + break elif graph_source.id in RESOURCE_HOGS: needs_on_disk_merge = True break diff --git a/Common/kgx_file_normalizer.py b/Common/kgx_file_normalizer.py index 71532188..cdd97c9e 100644 --- a/Common/kgx_file_normalizer.py +++ b/Common/kgx_file_normalizer.py @@ -3,27 +3,15 @@ import jsonlines import logging from Common.biolink_utils import BiolinkInformationResources, INFORES_STATUS_INVALID, INFORES_STATUS_DEPRECATED -from Common.node_types import SEQUENCE_VARIANT, PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, \ - PUBLICATIONS, OBJECT_ID, SUBJECT_ID, PREDICATE -from Common.normalization import NodeNormalizer, EdgeNormalizer, EdgeNormalizationResult +from Common.biolink_constants import SEQUENCE_VARIANT, PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, \ + PUBLICATIONS, OBJECT_ID, SUBJECT_ID, PREDICATE, SUBCLASS_OF +from Common.normalization import NormalizationScheme, NodeNormalizer, EdgeNormalizer, EdgeNormalizationResult, \ + NormalizationFailedError from Common.utils import LoggingUtil, chunk_iterator from Common.kgx_file_writer import KGXFileWriter -from Common.kgxmodel import NormalizationScheme from Common.merging import MemoryGraphMerger, DiskGraphMerger -class NormalizationBrokenError(Exception): - def __init__(self, error_message: str, 
actual_error: Exception=None): - self.error_message = error_message - self.actual_error = actual_error - - -class NormalizationFailedError(Exception): - def __init__(self, error_message: str, actual_error: Exception=None): - self.error_message = error_message - self.actual_error = actual_error - - EDGE_PROPERTIES_THAT_SHOULD_BE_SETS = {AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS} NODE_NORMALIZATION_BATCH_SIZE = 1_000_000 EDGE_NORMALIZATION_BATCH_SIZE = 1_000_000 @@ -250,6 +238,7 @@ def normalize_edge_file(self): edge_splits = 0 edges_failed_due_to_nodes = 0 edges_failed_due_to_predicates = 0 + subclass_loops_removed = 0 node_norm_lookup = self.node_normalizer.node_normalization_lookup edge_norm_lookup = self.edge_normalizer.edge_normalization_lookup @@ -303,6 +292,8 @@ def normalize_edge_file(self): normalized_predicate = edge[PREDICATE] edge_inverted_by_normalization = False + # a counter for the number of normalized edges coming from a single source edge + # it's only used to determine how many edge splits occurred edge_count = 0 # ensure edge has a primary knowledge source @@ -317,6 +308,12 @@ def normalize_edge_file(self): for norm_subject_id in normalized_subject_ids: for norm_object_id in normalized_object_ids: + + # if it's a subclass_of edge, and it's a self-loop, throw it out + if normalized_predicate == SUBCLASS_OF and norm_subject_id == norm_object_id: + subclass_loops_removed += 1 + continue + edge_count += 1 # create a new edge with the normalized values @@ -341,6 +338,7 @@ def normalize_edge_file(self): # this could happen due to rare cases of normalization splits where one node normalizes to many if edge_count > 1: edge_splits += edge_count - 1 + graph_merger.merge_edges(normalized_edges) self.logger.info(f'Processed {number_of_source_edges} edges so far...') @@ -355,16 +353,9 @@ def normalize_edge_file(self): infores_status = bl_inforesources.get_infores_status(knowledge_source) if infores_status == INFORES_STATUS_DEPRECATED: deprecated_infores_ids.append(knowledge_source) - self.logger.warning(f'Normalization found a deprecated infores identifier: {knowledge_source}') elif infores_status == INFORES_STATUS_INVALID: invalid_infores_ids.append(knowledge_source) - if invalid_infores_ids: - warning_message = f'Normalization found invalid infores identifiers: {invalid_infores_ids}' - self.logger.warning(warning_message) - if self.normalization_scheme.strict: - raise NormalizationFailedError(warning_message) - try: self.logger.debug(f'Writing normalized edges to file...') with open(self.edges_output_file_path, 'w') as edges_out: @@ -397,12 +388,15 @@ def normalize_edge_file(self): # this should be true: source_edges - failures - mergers + splits = edges post norm 'edge_mergers': graph_merger.merged_edge_counter, 'edge_splits': edge_splits, + 'subclass_loops_removed': subclass_loops_removed, 'final_normalized_edges': normalized_edge_count }) if deprecated_infores_ids: self.normalization_metadata['deprecated_infores_ids'] = deprecated_infores_ids + self.logger.warning(f'Normalization found deprecated infores identifiers: {deprecated_infores_ids}') if invalid_infores_ids: self.normalization_metadata['invalid_infores_ids'] = invalid_infores_ids + self.logger.warning(f'Normalization found invalid infores identifiers: {invalid_infores_ids}') diff --git a/Common/kgx_file_writer.py b/Common/kgx_file_writer.py index cfa3cdc2..a6f09b14 100644 --- a/Common/kgx_file_writer.py +++ b/Common/kgx_file_writer.py @@ -4,7 +4,7 @@ from Common.utils import LoggingUtil from Common.kgxmodel import 
kgxnode, kgxedge -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, \ +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, \ SUBJECT_ID, OBJECT_ID, PREDICATE @@ -143,6 +143,9 @@ def write_kgx_edge(self, edge: kgxedge): aggregator_knowledge_sources=edge.aggregator_knowledge_sources, edge_properties=edge.properties) + def write_normalized_edge(self, edge: dict): + self.__write_edge_to_file(edge) + def write_normalized_edges(self, edges: iter): for edge in edges: self.__write_edge_to_file(edge) diff --git a/Common/kgxmodel.py b/Common/kgxmodel.py index 61a89b47..d18c2f82 100644 --- a/Common/kgxmodel.py +++ b/Common/kgxmodel.py @@ -1,6 +1,7 @@ from dataclasses import dataclass -from Common.node_types import NAMED_THING -from Common.normalization import NORMALIZATION_CODE_VERSION +from Common.biolink_constants import NAMED_THING +from Common.metadata import GraphMetadata +from Common.normalization import NormalizationScheme class kgxnode: def __init__(self, @@ -33,31 +34,6 @@ def __init__(self, self.properties = {} -@dataclass -class NormalizationScheme: - node_normalization_version: str = 'latest' - edge_normalization_version: str = 'latest' - normalization_code_version: str = NORMALIZATION_CODE_VERSION - strict: bool = True - conflation: bool = False - - def get_composite_normalization_version(self): - composite_normalization_version = f'{self.node_normalization_version}_' \ - f'{self.edge_normalization_version}_{self.normalization_code_version}' - if self.conflation: - composite_normalization_version += '_conflated' - if self.strict: - composite_normalization_version += '_strict' - return composite_normalization_version - - def get_metadata_representation(self): - return {'node_normalization_version': self.node_normalization_version, - 'edge_normalization_version': self.edge_normalization_version, - 'normalization_code_version': self.normalization_code_version, - 'conflation': self.conflation, - 'strict': self.strict} - - @dataclass class GraphSpec: graph_id: str @@ -91,13 +67,13 @@ class GraphSource: @dataclass class SubGraphSource(GraphSource): - graph_metadata: dict = None + graph_metadata: GraphMetadata = None def get_metadata_representation(self): return {'graph_id': self.id, 'release_version': self.version, 'merge_strategy:': self.merge_strategy, - 'graph_metadata': self.graph_metadata} + 'graph_metadata': self.graph_metadata.metadata if self.graph_metadata else None} @dataclass diff --git a/Common/load_manager.py b/Common/load_manager.py index f2c5a926..9ba5aa44 100644 --- a/Common/load_manager.py +++ b/Common/load_manager.py @@ -5,9 +5,8 @@ from Common.data_sources import SourceDataLoaderClassFactory, RESOURCE_HOGS, get_available_data_sources from Common.utils import LoggingUtil, GetDataPullError -from Common.kgx_file_normalizer import KGXFileNormalizer, NormalizationBrokenError, NormalizationFailedError -from Common.kgxmodel import NormalizationScheme -from Common.normalization import NodeNormalizer, EdgeNormalizer +from Common.kgx_file_normalizer import KGXFileNormalizer +from Common.normalization import NormalizationScheme, NodeNormalizer, EdgeNormalizer, NormalizationFailedError from Common.metadata import SourceMetadata from Common.loader_interface import SourceDataBrokenError, SourceDataFailedError from Common.supplementation import SequenceVariantSupplementation, SupplementationFailedError @@ -183,7 +182,7 @@ def fetch_source(self, source_id: str, source_version: str='latest', retries: in 
f"{failed_error.error_message}") if retries < 2: self.logger.error(f"Retrying fetching for {source_id}.. (retry {retries + 1})") - self.fetch_source(source_id, retries=retries+1) + self.fetch_source(source_id=source_id, source_version=source_version, retries=retries+1) else: source_metadata.set_fetch_error(failed_error.error_message) source_metadata.set_fetch_status(SourceMetadata.FAILED) @@ -356,17 +355,6 @@ def normalize_source(self, normalization_status=SourceMetadata.STABLE, normalization_info=normalization_info) return True - except NormalizationBrokenError as broken_error: - error_message = f"{source_id} NormalizationBrokenError: {broken_error.error_message}" - if broken_error.actual_error: - error_message += f" - {broken_error.actual_error}" - self.logger.error(error_message) - source_metadata.update_normalization_metadata(parsing_version, - composite_normalization_version, - normalization_status=SourceMetadata.BROKEN, - normalization_error=error_message, - normalization_time=current_time) - return False except NormalizationFailedError as failed_error: error_message = f"{source_id} NormalizationFailedError: {failed_error.error_message}" if failed_error.actual_error: diff --git a/Common/merging.py b/Common/merging.py index 728f5056..ce617f0f 100644 --- a/Common/merging.py +++ b/Common/merging.py @@ -2,30 +2,51 @@ import jsonlines import secrets from xxhash import xxh64_hexdigest -from Common.node_types import * +from Common.biolink_utils import get_biolink_model_toolkit +from Common.biolink_constants import * from Common.utils import quick_json_loads, quick_json_dumps, chunk_iterator -NODE_PROPERTIES_THAT_SHOULD_BE_SETS = {SYNONYMS, NODE_TYPES} +NODE_PROPERTIES_THAT_SHOULD_BE_SETS = {SYNONYMS, NODE_TYPES, SYNONYM} EDGE_PROPERTIES_THAT_SHOULD_BE_SETS = {AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS, XREFS} +bmt = get_biolink_model_toolkit() + def edge_key_function(edge): + qualifiers = [f'{key}{value}' for key, value in edge.items() if bmt.is_qualifier(key)] return xxh64_hexdigest(f'{edge[SUBJECT_ID]}{edge[PREDICATE]}{edge[OBJECT_ID]}' - f'{edge.get(PRIMARY_KNOWLEDGE_SOURCE, "")}') + f'{edge.get(PRIMARY_KNOWLEDGE_SOURCE, "")}{"".join(qualifiers)}') def entity_merging_function(entity_1, entity_2, properties_that_are_sets): - for key, value in entity_2.items(): - # TODO - make sure this is the behavior we want - - # for properties that are lists append the values - # otherwise keep the first one - if key in entity_1: - if isinstance(value, list): - entity_1[key].extend(value) - if key in properties_that_are_sets: - entity_1[key] = list(set(entity_1[key])) + # for every property of entity 2 + for key, entity_2_value in entity_2.items(): + # if entity 1 also has the property and entity_2_value is not null/empty: + # concatenate values if one is a list, otherwise ignore the property from entity 2 + if (key in entity_1) and entity_2_value: + entity_1_value = entity_1[key] + entity_1_is_list = isinstance(entity_1_value, list) + entity_2_is_list = isinstance(entity_2_value, list) + if entity_1_is_list and entity_2_is_list: + # if they're both lists just combine them + entity_1_value.extend(entity_2_value) + elif entity_1_is_list: + # if 1 is a list and 2 isn't, append the value of 2 to the list from 1 + entity_1_value.append(entity_2_value) + elif entity_2_is_list: + if entity_1_value: + # if 2 is a list and 1 has a value, add the value of 1 to the list from 2 + entity_1[key] = [entity_1_value] + entity_2_value + else: + # if 2 is a list and 1 doesn't have a value, just use the list from 2 + 
entity_1[key] = entity_2_value + # else: + # if neither is a list, do nothing (keep the value from 1) + if (entity_1_is_list or entity_2_is_list) and (key in properties_that_are_sets): + entity_1[key] = list(set(entity_1[key])) else: - entity_1[key] = value + # if entity 1 doesn't have the property, add the property from entity 2 + entity_1[key] = entity_2_value return entity_1 diff --git a/Common/meta_kg.py b/Common/meta_kg.py index ed6233bf..dc3a4e20 100644 --- a/Common/meta_kg.py +++ b/Common/meta_kg.py @@ -3,7 +3,7 @@ import os from collections import defaultdict -from Common.node_types import NODE_TYPES, SUBJECT_ID, OBJECT_ID, PREDICATE, PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES +from Common.biolink_constants import NODE_TYPES, SUBJECT_ID, OBJECT_ID, PREDICATE, PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES from Common.utils import quick_jsonl_file_iterator from Common.biolink_utils import BiolinkUtils diff --git a/Common/metadata.py b/Common/metadata.py index ec9bfecd..9a467f7d 100644 --- a/Common/metadata.py +++ b/Common/metadata.py @@ -3,7 +3,7 @@ import json from xxhash import xxh64_hexdigest -from Common.kgxmodel import NormalizationScheme +from Common.normalization import NormalizationScheme class Metadata: @@ -122,6 +122,9 @@ def get_build_status(self): def get_graph_version(self): return self.metadata['graph_version'] + def get_source_ids(self): + return [source['source_id'] for source in self.metadata['sources']] + class SourceMetadata(Metadata): diff --git a/Common/neo4j_meta_kg.py b/Common/neo4j_meta_kg.py index 135aea14..7db78d66 100644 --- a/Common/neo4j_meta_kg.py +++ b/Common/neo4j_meta_kg.py @@ -3,7 +3,7 @@ import os from collections import defaultdict from Common.neo4j_tools import Neo4jTools -from Common.node_types import ROOT_ENTITY +from Common.biolink_constants import NAMED_THING from Common.biolink_utils import BiolinkUtils @@ -27,8 +27,8 @@ def generate_meta_kg_and_sri_test_data(self, neo4j_access: Neo4jTools, output_di self.logger.info(f"Completed schema query ({after_time - before_time} seconds). 
Preparing initial schema.") schema = defaultdict(lambda: defaultdict(set)) - # avoids adding nodes with only a ROOT_ENTITY label (currently NamedThing) - filter_named_thing = lambda x: set(filter(lambda y: y != ROOT_ENTITY, x)) + # avoids adding nodes with only a NAMED_THING label (currently NamedThing) + filter_named_thing = lambda x: set(filter(lambda y: y != NAMED_THING, x)) for schema_result in schema_query_results: source_labels, predicate, target_labels = \ self.bl_utils.find_biolink_leaves(filter_named_thing(schema_result['source_labels'])), \ diff --git a/Common/neo4j_tools.py b/Common/neo4j_tools.py index 175df9da..0b3b69e6 100644 --- a/Common/neo4j_tools.py +++ b/Common/neo4j_tools.py @@ -3,7 +3,7 @@ import neo4j import subprocess import Common.kgx_file_converter as kgx_file_converter -from Common.node_types import NAMED_THING +from Common.biolink_constants import NAMED_THING from Common.utils import LoggingUtil @@ -19,7 +19,7 @@ def __init__(self, self.http_port = http_port self.https_port = https_port self.bolt_port = bolt_port - self.password = password if password else os.environ['ORION_NEO4J_PASSWORD'] + self.password = password if password else os.environ.get('ORION_NEO4J_PASSWORD', 'orion-password') self.graph_db_uri = f'bolt://{neo4j_host}:{bolt_port}' self.graph_db_auth = ("neo4j", self.password) self.neo4j_driver = neo4j.GraphDatabase.driver(self.graph_db_uri, auth=self.graph_db_auth) @@ -37,11 +37,12 @@ def import_csv_files(self, return password_exit_code self.logger.info(f'Importing csv files to neo4j...') - neo4j_import_cmd = ["neo4j-admin", "import", f"--nodes={csv_nodes_filename}", - f"--relationships={csv_edges_filename}", + neo4j_import_cmd = ['neo4j-admin', 'database', 'import', 'full', + f'--nodes={csv_nodes_filename}', + f'--relationships={csv_edges_filename}', '--delimiter=TAB', '--array-delimiter=U+001F', - '--force'] + '--overwrite-destination=true'] import_results: subprocess.CompletedProcess = subprocess.run(neo4j_import_cmd, cwd=graph_directory, capture_output=True) @@ -60,7 +61,7 @@ def load_backup_dump(self, return password_exit_code self.logger.info(f'Loading a neo4j backup dump {dump_file_path}...') - neo4j_load_cmd = ['neo4j-admin', 'load', f'--from={dump_file_path}', '--force'] + neo4j_load_cmd = ['neo4j-admin', 'database', 'load', f'--from-path={dump_file_path}', '--overwrite-destination=true', 'neo4j'] load_results: subprocess.CompletedProcess = subprocess.run(neo4j_load_cmd, capture_output=True) self.logger.info(load_results.stdout) @@ -71,10 +72,23 @@ def load_backup_dump(self, self.logger.error(error_message) return load_results_return_code + def migrate_dump_to_neo4j_5(self): + self.logger.info(f'Migrating db dump to neo4j 5...') + neo4j_migrate_cmd = ['neo4j-admin', 'database', 'migrate', '--force-btree-indexes-to-range', 'neo4j'] + migrate_results: subprocess.CompletedProcess = subprocess.run(neo4j_migrate_cmd, + capture_output=True) + self.logger.info(migrate_results.stdout) + results_return_code = migrate_results.returncode + if results_return_code != 0: + error_message = f'Neo4j migrate subprocess error (ExitCode {results_return_code}): ' \ + f'{migrate_results.stderr.decode("UTF-8")}' + self.logger.error(error_message) + return results_return_code + def create_backup_dump(self, - dump_file_path: str = None): + dump_directory: str = None): self.logger.info(f'Creating a backup dump of the neo4j...') - neo4j_dump_cmd = ['neo4j-admin', 'dump', f'--to={dump_file_path}'] + neo4j_dump_cmd = ['neo4j-admin', 'database', 'dump', 'neo4j', 
f'--to-path={dump_directory}'] dump_results: subprocess.CompletedProcess = subprocess.run(neo4j_dump_cmd, capture_output=True) self.logger.info(dump_results.stdout) @@ -107,7 +121,7 @@ def __issue_neo4j_command(self, command: str): def set_initial_password(self): self.logger.info('Setting initial password for Neo4j...') - neo4j_cmd = ['neo4j-admin', 'set-initial-password', self.password] + neo4j_cmd = ['neo4j-admin', 'dbms', 'set-initial-password', self.password] neo4j_results: subprocess.CompletedProcess = subprocess.run(neo4j_cmd, capture_output=True) self.logger.info(neo4j_results.stdout) @@ -139,7 +153,7 @@ def add_db_indexes(self): with self.neo4j_driver.session() as session: # node name index - node_name_index_cypher = f'CREATE INDEX node_name_index FOR (n:`{NAMED_THING}`) on (n.name)' + node_name_index_cypher = f'CREATE INDEX node_name_index FOR (n:`{NAMED_THING}`) ON (n.name)' self.logger.info(f'Adding node name index on {NAMED_THING}.name') session.run(node_name_index_cypher).consume() indexes_added += 1 @@ -151,8 +165,8 @@ def add_db_indexes(self): self.logger.info(f'Adding node id indexes for node labels: {node_labels}') for node_label in node_labels: node_label_index = f'node_id_{node_label.replace(":", "_")}' - node_name_index_cypher = f'CREATE CONSTRAINT {node_label_index} ON (n:`{node_label}`) ' \ - f'ASSERT n.id IS UNIQUE' + node_name_index_cypher = f'CREATE CONSTRAINT {node_label_index} FOR (n:`{node_label}`) ' \ + f'REQUIRE n.id IS UNIQUE' session.run(node_name_index_cypher).consume() indexes_added += 1 index_names.append(node_label_index) @@ -205,32 +219,34 @@ def close(self): self.neo4j_driver.close() -def create_neo4j_dump(graph_directory: str, - nodes_filename: str = 'nodes.jsonl', - edges_filename: str = 'edges.jsonl', +def create_neo4j_dump(nodes_filepath: str, + edges_filepath: str, + output_directory: str, graph_id: str = 'graph', graph_version: str = '', logger=None): - graph_nodes_file_path = os.path.join(graph_directory, nodes_filename) - graph_edges_file_path = os.path.join(graph_directory, edges_filename) nodes_csv_filename = 'nodes.temp_csv' edges_csv_filename = 'edges.temp_csv' - csv_nodes_file_path = os.path.join(graph_directory, nodes_csv_filename) - csv_edges_file_path = os.path.join(graph_directory, edges_csv_filename) + csv_nodes_file_path = os.path.join(output_directory, nodes_csv_filename) + csv_edges_file_path = os.path.join(output_directory, edges_csv_filename) if os.path.exists(csv_nodes_file_path) and os.path.exists(csv_edges_file_path): if logger: logger.info(f'CSV files were already created for {graph_id}({graph_version})') else: if logger: logger.info(f'Creating CSV files for {graph_id}({graph_version})...') - kgx_file_converter.convert_jsonl_to_neo4j_csv(nodes_input_file=graph_nodes_file_path, - edges_input_file=graph_edges_file_path, + kgx_file_converter.convert_jsonl_to_neo4j_csv(nodes_input_file=nodes_filepath, + edges_input_file=edges_filepath, nodes_output_file=csv_nodes_file_path, edges_output_file=csv_edges_file_path) if logger: logger.info(f'CSV files created for {graph_id}({graph_version})...') - graph_dump_file_path = os.path.join(graph_directory, f'graph_{graph_version}.db.dump') + # would like to do the following, but apparently you can't specify a custom name for the dump now + # graph_dump_name = f'graph_{graph_version}.neo4j5.db.dump' if graph_version else 'graph.neo4j5.db.dump' + # graph_dump_file_path = os.path.join(output_directory, graph_dump_name) + graph_dump_name = 'neo4j.dump' + graph_dump_file_path = 
os.path.join(output_directory, graph_dump_name) if os.path.exists(graph_dump_file_path): if logger: logger.info(f'Neo4j dump already exists for {graph_id}({graph_version})') @@ -238,7 +254,7 @@ def create_neo4j_dump(graph_directory: str, neo4j_access = Neo4jTools() try: - import_exit_code = neo4j_access.import_csv_files(graph_directory=graph_directory, + import_exit_code = neo4j_access.import_csv_files(graph_directory=output_directory, csv_nodes_filename=nodes_csv_filename, csv_edges_filename=edges_csv_filename) if import_exit_code != 0: @@ -260,7 +276,7 @@ def create_neo4j_dump(graph_directory: str, if stop_exit_code != 0: return False - dump_exit_code = neo4j_access.create_backup_dump(graph_dump_file_path) + dump_exit_code = neo4j_access.create_backup_dump(output_directory) if dump_exit_code != 0: return False diff --git a/Common/node_types.py b/Common/node_types.py deleted file mode 100644 index 55903670..00000000 --- a/Common/node_types.py +++ /dev/null @@ -1,80 +0,0 @@ -# A collection of constants for biolink variable names and types -NAMED_THING = 'biolink:NamedThing' -BIOLOGICAL_ENTITY = 'biolink:BiologicalEntity' -DISEASE_OR_PHENOTYPIC_FEATURE = 'biolink:DiseaseOrPhenotypicFeature' -DISEASE = 'biolink:Disease' -PHENOTYPIC_FEATURE = 'biolink:PhenotypicFeature' -MOLECULAR_ENTITY = 'biolink:MolecularEntity' -CHEMICAL_SUBSTANCE = 'biolink:ChemicalSubstance' -DRUG = 'biolink:Drug' -METABOLITE = 'biolink:Metabolite' -ANATOMICAL_ENTITY = 'biolink:AnatomicalEntity' -GENE = 'biolink:Gene' -GENE_PRODUCT = 'biolink:GeneProduct' -GENE_OR_GENE_PRODUCT = 'biolink:GeneOrGeneProduct' -SEQUENCE_VARIANT = 'biolink:SequenceVariant' -BIOLOGICAL_PROCESS_OR_ACTIVITY = 'biolink:BiologicalProcessOrActivity' -MOLECULAR_ACTIVITY = 'biolink:MolecularActivity' -BIOLOGICAL_PROCESS = 'biolink:BiologicalProcess' -PATHWAY = 'biolink:Pathway' -CELLULAR_COMPONENT = 'biolink:CellularComponent' -CELL = 'biolink:Cell' -GROSS_ANATOMICAL_STRUCTURE = 'biolink:GrossAnatomicalStructure' -GENETIC_CONDITION = 'biolink:GeneticCondition' -UNSPECIFIED = 'biolink:Unspecified' -GENE_FAMILY = 'biolink:GeneFamily' -GENOMIC_ENTITY = 'biolink:GenomicEntity' -FOOD = 'biolink:Food' -MACROMOLECULAR_COMPLEX = 'biolink:MacromolecularComplex' - -# The root of all biolink_model entities -ROOT_ENTITY = NAMED_THING - -# a property name for listing node types that did not normalize -CUSTOM_NODE_TYPES = 'custom_node_types' - -node_types = [ - NAMED_THING, - BIOLOGICAL_ENTITY, - DISEASE_OR_PHENOTYPIC_FEATURE, - DISEASE, - PHENOTYPIC_FEATURE, - MOLECULAR_ENTITY, - CHEMICAL_SUBSTANCE, - DRUG, - METABOLITE, - ANATOMICAL_ENTITY, - GENE, - SEQUENCE_VARIANT, - BIOLOGICAL_PROCESS_OR_ACTIVITY, - MOLECULAR_ACTIVITY, - BIOLOGICAL_PROCESS, - PATHWAY, - CELLULAR_COMPONENT, - CELL, - GROSS_ANATOMICAL_STRUCTURE, - GENETIC_CONDITION, - UNSPECIFIED, - GENE_FAMILY, - FOOD, - MACROMOLECULAR_COMPLEX -] - -# The following are used by edges: -SUBJECT_ID = 'subject' -OBJECT_ID = 'object' -PREDICATE = 'predicate' -NODE_TYPES = 'category' -SYNONYMS = 'equivalent_identifiers' -INFORMATION_CONTENT = 'information_content' -DESCRIPTION = 'description' - -FALLBACK_EDGE_PREDICATE = 'biolink:related_to' - -PRIMARY_KNOWLEDGE_SOURCE = 'biolink:primary_knowledge_source' -AGGREGATOR_KNOWLEDGE_SOURCES = 'biolink:aggregator_knowledge_source' -PUBLICATIONS = 'publications' -AFFINITY = 'affinity' -AFFINITY_PARAMETER = 'affinityParameter' -XREFS = 'xref' - diff --git a/Common/normalization.py b/Common/normalization.py index 0d8c8289..39150eeb 100644 --- a/Common/normalization.py +++ 
b/Common/normalization.py @@ -3,11 +3,50 @@ import requests import time +from requests.adapters import HTTPAdapter, Retry +from dataclasses import dataclass + from robokop_genetics.genetics_normalization import GeneticsNormalizer -from Common.node_types import * +from Common.biolink_constants import * from Common.utils import LoggingUtil -NORMALIZATION_CODE_VERSION = '1.1' +NORMALIZATION_CODE_VERSION = '1.2' + +# node property name for node types that did not normalize +CUSTOM_NODE_TYPES = 'custom_node_types' + +# predicate to use when normalization fails +FALLBACK_EDGE_PREDICATE = 'biolink:related_to' + +@dataclass +class NormalizationScheme: + node_normalization_version: str = 'latest' + edge_normalization_version: str = 'latest' + normalization_code_version: str = NORMALIZATION_CODE_VERSION + strict: bool = True + conflation: bool = False + + def get_composite_normalization_version(self): + composite_normalization_version = f'{self.node_normalization_version}_' \ + f'{self.edge_normalization_version}_{self.normalization_code_version}' + if self.conflation: + composite_normalization_version += '_conflated' + if self.strict: + composite_normalization_version += '_strict' + return composite_normalization_version + + def get_metadata_representation(self): + return {'node_normalization_version': self.node_normalization_version, + 'edge_normalization_version': self.edge_normalization_version, + 'normalization_code_version': self.normalization_code_version, + 'conflation': self.conflation, + 'strict': self.strict} + + +class NormalizationFailedError(Exception): + def __init__(self, error_message: str, actual_error: Exception = None): + self.error_message = error_message + self.actual_error = actual_error class NodeNormalizer: @@ -64,109 +103,95 @@ def __init__(self, self.sequence_variant_normalizer = None self.variant_node_types = None - def hit_node_norm_service(self, curies, retries=0): - resp: requests.models.Response = requests.post(f'{self.node_norm_endpoint}get_normalized_nodes', - json={'curies': curies, - 'conflate': self.conflate_node_types, - 'drug_chemical_conflate': self.conflate_node_types, - 'description': True}) + self.requests_session = self.get_normalization_requests_session() + + def hit_node_norm_service(self, curies): + resp = self.requests_session.post(f'{self.node_norm_endpoint}get_normalized_nodes', + json={'curies': curies, + 'conflate': self.conflate_node_types, + 'drug_chemical_conflate': self.conflate_node_types, + 'description': True}) if resp.status_code == 200: # if successful return the json as an object - return resp.json() - else: - error_message = f'Node norm response code: {resp.status_code}' - if resp.status_code >= 500: - # if 5xx retry 3 times - retries += 1 - if retries == 4: - error_message += ', retried 3 times, giving up..' - self.logger.error(error_message) - resp.raise_for_status() - else: - error_message += f', retrying.. 
(attempt {retries})' - time.sleep(retries * 3) - self.logger.error(error_message) - return self.hit_node_norm_service(curies, retries) + response_json = resp.json() + if response_json: + return response_json else: - # we should never get a legitimate 4xx response from node norm, - # crash with an error for troubleshooting - if resp.status_code == 422: - error_message += f'(curies: {curies})' - self.logger.error(error_message) - resp.raise_for_status() + error_message = f"Node Normalization service {self.node_norm_endpoint} returned 200 " \ + f"but with an empty result for (curies: {curies})" + raise NormalizationFailedError(error_message=error_message) + else: + error_message = f'Node norm response code: {resp.status_code} (curies: {curies})' + self.logger.error(error_message) + resp.raise_for_status() - def normalize_node_data(self, node_list: list, block_size: int = 1000) -> list: + def normalize_node_data(self, node_list: list, batch_size: int = 1000) -> list: """ - This method calls the NodeNormalization web service to get the normalized identifier and name of the node. - the data comes in as a node list. + This method calls the NodeNormalization web service and normalizes a list of nodes. - :param node_list: A list with items to normalize - :param block_size: the number of curies in the request + :param node_list: A list of unique nodes to normalize + :param batch_size: the number of curies to be sent to NodeNormalization at once :return: """ - self.logger.debug(f'Start of normalize_node_data. items: {len(node_list)}') - - # init the cache - this accumulates all the results from the node norm service - cached_node_norms: dict = {} - - # create a unique set of node ids - tmp_normalize: set = set([node['id'] for node in node_list]) + # look up all valid biolink node types if needed + # this is used when strict normalization is off to ensure only valid types go into the graph as NODE_TYPES + if not self.strict_normalization and not self.biolink_compliant_node_types: + biolink_lookup = EdgeNormalizer(edge_normalization_version=self.biolink_version) + self.biolink_compliant_node_types = biolink_lookup.get_valid_node_types() - # convert the set to a list so we can iterate through it - to_normalize: list = list(tmp_normalize) + # make a list of the node ids, we used to deduplicate here, but now we expect the list to be unique ids + to_normalize: list = [node['id'] for node in node_list] - # init the array index lower boundary + # use indexes and slice to grab batch_size sized chunks of ids from the list start_index: int = 0 - - # get the last index of the list last_index: int = len(to_normalize) - - self.logger.debug(f'{last_index} unique nodes found in this group.') - - # grab chunks of the data frame + chunks_of_ids = [] while True: if start_index < last_index: - # define the end index of the slice - end_index: int = start_index + block_size + end_index: int = start_index + batch_size - # force the end index to be the last index to insure no overflow + # force the end index to be no greater than the last index to ensure no overflow if end_index >= last_index: end_index = last_index - self.logger.debug(f'Working block {start_index} to {end_index}.') - - # collect a slice of records from the data frame - data_chunk: list = to_normalize[start_index: end_index] - - # hit the node norm api - normalization_json = self.hit_node_norm_service(curies=data_chunk) - if normalization_json: - # merge the normalization results with what we have gotten so far - 
cached_node_norms.update(**normalization_json) - else: - # this shouldn't happen but if the API returns an empty dict instead of nulls, - # assume none of the curies normalize - empty_responses = {curie: None for curie in data_chunk} - cached_node_norms.update(empty_responses) + # collect a slice of block_size curies from the full list + chunks_of_ids.append(to_normalize[start_index: end_index]) # move on down the list - start_index += block_size + start_index += batch_size else: break + # we should be able to do the following, but it's causing RemoteDisconnected errors with node norm + # + # hit the node norm api with the chunks of curies in parallel + # we could try to optimize the number of max_workers for ThreadPoolExecutor more specifically, + # by default python attempts to find a reasonable # based on os.cpu_count() + # with ThreadPoolExecutor() as executor: + # executor_results = executor.map(self.hit_node_norm_service, chunks_of_ids) + # + # normalization_results = list(executor_results) + # for normalization_json, ids in zip(normalization_results, chunks_of_ids): + # if not normalization_json: + # raise NormalizationFailedError(f'!!! Normalization json results missing for ids: {ids}') + # else: + # merge the normalization results into one dictionary + # node_normalization_results.update(**normalization_json) + + # until we can get threading working, hit node norm sequentially + node_normalization_results: dict = {} + for chunk in chunks_of_ids: + results = self.hit_node_norm_service(chunk) + node_normalization_results.update(**results) + # reset the node index node_idx = 0 # node ids that failed to normalize failed_to_normalize: list = [] - # look up valid node types if needed - if not self.strict_normalization and not self.biolink_compliant_node_types: - biolink_lookup = EdgeNormalizer(edge_normalization_version=self.biolink_version) - self.biolink_compliant_node_types = biolink_lookup.get_valid_node_types() - # for each node update the node with normalized information # store the normalized IDs in self.node_normalization_lookup for later look up while node_idx < len(node_list): @@ -192,7 +217,7 @@ def normalize_node_data(self, node_list: list, block_size: int = 1000) -> list: if not self.strict_normalization: if NODE_TYPES not in current_node: - current_node[NODE_TYPES] = [ROOT_ENTITY] + current_node[NODE_TYPES] = [NAMED_THING] # remove all the bad types and make them a property instead invalid_node_types = [node_type for node_type in current_node[NODE_TYPES] if @@ -203,15 +228,15 @@ def normalize_node_data(self, node_list: list, block_size: int = 1000) -> list: # keep all the valid types current_node[NODE_TYPES] = [node_type for node_type in current_node[NODE_TYPES] if node_type in self.biolink_compliant_node_types] - # add the ROOT ENTITY type if it's not there - if ROOT_ENTITY not in current_node[NODE_TYPES]: - current_node[NODE_TYPES].append(ROOT_ENTITY) + # add the NAMED_THING type if it's not there + if NAMED_THING not in current_node[NODE_TYPES]: + current_node[NODE_TYPES].append(NAMED_THING) # enforce that the list is really a set current_node[NODE_TYPES] = list(set(current_node[NODE_TYPES])) # did we get a response from the normalizer - current_node_normalization = cached_node_norms[current_node_id] + current_node_normalization = node_normalization_results[current_node_id] if current_node_normalization is not None: current_node_id_section = current_node_normalization['id'] @@ -335,6 +360,17 @@ def get_current_node_norm_version(self): # this shouldn't happen, raise an 
exception resp.raise_for_status() + @staticmethod + def get_normalization_requests_session(): + pool_maxsize = max(os.cpu_count(), 10) + s = requests.Session() + retries = Retry(total=8, + backoff_factor=1, + status_forcelist=[502, 503, 504, 403, 429]) + s.mount('https://', HTTPAdapter(max_retries=retries, pool_maxsize=pool_maxsize)) + s.mount('http://', HTTPAdapter(max_retries=retries, pool_maxsize=pool_maxsize)) + return s + class EdgeNormalizationResult: def __init__(self, @@ -392,10 +428,8 @@ def normalize_edge_data(self, """ # find the predicates that have not been normalized yet - predicates_to_normalize = set() - for edge in edge_list: - if edge[PREDICATE] not in self.edge_normalization_lookup: - predicates_to_normalize.add(edge[PREDICATE]) + predicates_to_normalize = {edge[PREDICATE] for edge in edge_list + if edge[PREDICATE] not in self.edge_normalization_lookup} # convert the set to a list so we can iterate through it predicates_to_normalize_list = list(predicates_to_normalize) @@ -514,7 +548,7 @@ def check_node_type_valid(self, node_type: str): def get_valid_node_types(self): # call the descendants endpoint with the root node type - edge_norm_descendants_url = f'{self.edge_norm_endpoint}bl/{ROOT_ENTITY}/descendants?version={self.edge_norm_version}' + edge_norm_descendants_url = f'{self.edge_norm_endpoint}bl/{NAMED_THING}/descendants?version={self.edge_norm_version}' resp: requests.models.Response = requests.get(edge_norm_descendants_url) # did we get a good status code @@ -525,3 +559,50 @@ def get_valid_node_types(self): else: # this shouldn't happen, raise an exception resp.raise_for_status() + + +NAME_RESOLVER_URL = os.environ.get('NAME_RESOLVER_ENDPOINT', "https://name-resolution-sri.renci.org/") + 'lookup' +NAME_RESOLVER_HEADERS = {"accept": "application/json"} +NAME_RESOLVER_API_ERROR = 'api_error' + + +def call_name_resolution(name: str, biolink_type: str, retries=0, logger=None): + nameres_payload = { + "string": name, + "biolink_type": biolink_type if biolink_type else "", + "autocomplete": False + } + try: + # logger.info(f'About to call name res..') + nameres_result = requests.get(NAME_RESOLVER_URL, + params=nameres_payload, + headers=NAME_RESOLVER_HEADERS, + timeout=45) + # logger.info(f'Got result from name res {nameres_result.status_code}') + if nameres_result.status_code == 200: + # return the first result if there is one + nameres_json = nameres_result.json() + # logger.info(f'Unpacked json..') + return nameres_json[0] if nameres_json else None + else: + error_message = f'Non-200 result from name resolution (url: {NAME_RESOLVER_URL}, ' \ + f'payload: {nameres_payload}). Status code: {nameres_result.status_code}.' + except requests.exceptions.ConnectionError as e: + error_message = f'Connection Error calling name resolution (url: {NAME_RESOLVER_URL}, ' \ + f'payload: {nameres_payload}). Error: {e}.' + except requests.exceptions.Timeout as t: + error_message = f'Calling name resolution timed out (url: {NAME_RESOLVER_URL}, ' \ + f'payload: {nameres_payload}). Error: {t}.' 
+
+    # if we get here something went wrong, log error and retry
+    if logger:
+        logger.error(error_message)
+    else:
+        print(error_message)
+    if retries < 2:
+        time.sleep(5)
+        logger.info('Retrying name resolution..')
+        return call_name_resolution(name, biolink_type, retries + 1, logger)
+
+    # if retried 2 times already give up and return the last error
+    return {NAME_RESOLVER_API_ERROR: error_message}
diff --git a/Common/predicates.py b/Common/predicates.py
index e6fd89c2..966247cf 100644
--- a/Common/predicates.py
+++ b/Common/predicates.py
@@ -8,43 +8,43 @@
     "allosteric_antagonist": f"{DGIDB}:antagonist",
     "allosteric_modulator": f"{DGIDB}:modulator",
     "antagonist": f"{DGIDB}:antagonist",
-    "antibody": f"{DGIDB}:binder",
-    "antibody_binding": f"{DGIDB}:binder",
+    "antibody": f"RO:0002436",
+    "antibody_binding": f"RO:0002436",
     "antisense_inhibitor": f"{DGIDB}:inhibitor",
-    "app_ki": f"RO:0002434",
-    "app_km": f"RO:0002434",
-    "binding_agent": f"{DGIDB}:binder",
+    "app_ki": f"RO:0002434", # apparent Ki? if so change to RO:0002436
+    "app_km": f"RO:0002434", # apparent Km? if so change to RO:0002436
+    "binding_agent": f"RO:0002436",
     "blocker": f"{DGIDB}:blocker",
     "channel_blocker": f"{DGIDB}:channel_blocker",
     "ec50": f"{DGIDB}:agonist",
-    "ed50": f"RO:0002434",
+    "ed50": f"RO:0002434", # Effective Dose. Where does this predicate come from? CB (2024_07): "it makes no sense to have an ed50 between a chemical and a gene/protein"
     "gating_inhibitor": f"{DGIDB}:gating_inhibitor",
-    "gi50": f"RO:0002434",
+    "gi50": f"{DGIDB}:Inhibitor", # Growth Inhibitor
     "ic50": f"{DGIDB}:inhibitor",
     "inhibitor": f"{DGIDB}:inhibitor",
-    "interacts_with": f"RO:0002434",
+    "interacts_with": f"RO:0002434", # Where does this predicate come from? Possibly needs to be modified to RO:0002436
     "inverse_agonist": f"{DGIDB}:inverse_agonist",
-    "ka": f"RO:0002434",
-    "kact": f"RO:0002434",
-    "kb": f"{DGIDB}:binder",
-    "kd": f"{DGIDB}:binder",
-    "kd1": f"RO:0002434",
+    "ka": f"RO:0002436",
+    "kact": f"RO:0002436", # is this a typo for kcat?
+    "kb": f"RO:0002436", # {DGIDB}:binder maps to biolink:binds which is deprecated
+    "kd": f"RO:0002436",
+    "kd1": f"RO:0002436", # RO:0002434 maps to biolink:related_to
     "ki": f"{DGIDB}:inhibitor",
-    "km": f"RO:0002434",
-    "ks": f"RO:0002434",
+    "km": f"RO:0002436",
+    "ks": f"RO:0002436",
     "modulator": f"{DGIDB}:modulator",
-    "mic": f"RO:0002434",
-    "mpc": f"RO:0002434",
+    "mic": f"RO:0002434", # What is this referring to?
+    "mpc": f"RO:0002434", # What is this referring to?
     "negative_modulator": f"{CHEMBL_MECHANISM}:negative_modulator",
     "negative_allosteric_modulator": f"{CHEMBL_MECHANISM}:negative_modulator",
     "opener": f"{CHEMBL_MECHANISM}:opener",
     "other": f"{DGIDB}:other",
     "partial_agonist": f"{DGIDB}:partial_agonist",
-    "pa2": f"RO:0002434",
+    "pa2": f"RO:0002434", # What is this referring to?
"pharmacological_chaperone": f"{DGIDB}:chaperone", "positive_allosteric_modulator": f"{CHEMBL_MECHANISM}:positive_modulator", "positive_modulator": f"{CHEMBL_MECHANISM}:positive_modulator", "releasing_agent": f"{CHEMBL_MECHANISM}:releasing_agent", "substrate": f"{CHEMBL_MECHANISM}:substrate", - "xc50": f"RO:0002434" + "xc50": f"RO:0002436" # This is related to ec50 and ic50 both of which describe binding events } diff --git a/Common/redundant_kg.py b/Common/redundant_kg.py new file mode 100644 index 00000000..38eedc34 --- /dev/null +++ b/Common/redundant_kg.py @@ -0,0 +1,106 @@ +from itertools import product +from functools import cache +try: + from tqdm import tqdm + TQDM_AVAILABLE = True +except ImportError: + TQDM_AVAILABLE = False + +from Common.biolink_utils import get_biolink_model_toolkit +from Common.biolink_constants import OBJECT_ASPECT_QUALIFIER, OBJECT_DIRECTION_QUALIFIER, SPECIES_CONTEXT_QUALIFIER, \ + QUALIFIED_PREDICATE, PREDICATE +from Common.utils import quick_jsonl_file_iterator, snakify +from Common.kgx_file_writer import KGXFileWriter + +bmt = get_biolink_model_toolkit() + +# TODO - really we should get the full list of qualifiers from Common/biolink_constants.py, +# but because we currently cannot deduce the association types of edges and/or permissible value enumerators, +# we have to hard code qualifier handling anyway, we might as well check against a smaller list +QUALIFIER_KEYS = [OBJECT_ASPECT_QUALIFIER, + OBJECT_DIRECTION_QUALIFIER] +# we do have these qualifiers but we cant do any redundancy with them so ignore for now: +# QUALIFIED_PREDICATE - +# SPECIES_CONTEXT_QUALIFIER - + + +# bmt does a lot of caching, but because we are doing the string manipulation it's prob a lot faster to cache these +@cache +def get_ancestor_predicates_biolink(predicate): + cur_predicate = predicate.split(':')[-1] + return set([f'{snakify(curie)}' for curie in bmt.get_ancestors(cur_predicate, formatted=True, reflexive=False)]) + + +def check_qualifier(ed): + qfs = [] + for k in ed.keys(): + if bmt.is_qualifier(k): + qfs.append(k) + return qfs + + +def write_edge_no_q(edge, predicate): + tmp_edge = edge.copy() + tmp_edge[PREDICATE] = f"{predicate}" + tmp_edge.pop(OBJECT_DIRECTION_QUALIFIER, None) + tmp_edge.pop(OBJECT_ASPECT_QUALIFIER, None) + tmp_edge.pop(QUALIFIED_PREDICATE, None) + return tmp_edge + + +def generate_redundant_kg(infile, edges_file_path): + + with KGXFileWriter(edges_output_file_path=edges_file_path) as kgx_file_writer: + for edge in tqdm(quick_jsonl_file_iterator(infile)) if TQDM_AVAILABLE else quick_jsonl_file_iterator(infile): + + try: + edge_predicate = edge['predicate'] + except KeyError: + print(f"Redundant Graph Failed - missing predicate on edge: {edge}") + break + + ancestor_predicates = get_ancestor_predicates_biolink(edge_predicate) + + # qualifiers = check_qualifier(edge) <- it would be better to do something like this but because we're not + # handling other qualifiers anyway it's faster to just do the following: + qualifiers = [qualifier for qualifier in QUALIFIER_KEYS if qualifier in edge] + + # The following looks up the permissible values for ancestors of the current qualfier values. + # Aspects and directions are handled slightly differently, because when we have aspect AND direction, + # you cant remove the aspect, but you can remove the direction. 
+
+            # for aspect overwrite [None] so that permutations don't include options with no aspect
+            aspect_values = [None]
+            if OBJECT_ASPECT_QUALIFIER in qualifiers:
+                aspect_values = bmt.get_permissible_value_ancestors(permissible_value=edge[OBJECT_ASPECT_QUALIFIER],
+                                                                    enum_name='GeneOrGeneProductOrChemicalEntityAspectEnum')
+
+            # for direction include None so permutations include options with no direction
+            direction_values = [None]
+            if OBJECT_DIRECTION_QUALIFIER in qualifiers:
+                direction_values += bmt.get_permissible_value_ancestors(permissible_value=edge[OBJECT_DIRECTION_QUALIFIER],
+                                                                        enum_name='DirectionQualifierEnum')
+
+            # permutations of permissible qualifier values and their ancestors, write an edge for each permutation
+            edges_to_write = []
+            for (a, d) in product(aspect_values, direction_values):
+                edge_copy = edge.copy()
+                if a:
+                    edge_copy[OBJECT_ASPECT_QUALIFIER] = a
+                else:
+                    edge_copy.pop(OBJECT_ASPECT_QUALIFIER, None)
+                if d:
+                    edge_copy[OBJECT_DIRECTION_QUALIFIER] = d
+                else:
+                    edge_copy.pop(OBJECT_DIRECTION_QUALIFIER, None)
+                edges_to_write.append(edge_copy)
+
+            # if there was an aspect qualifier, write the edge with no qualifiers because it hasn't happened yet
+            if OBJECT_ASPECT_QUALIFIER in qualifiers:
+                edges_to_write.append(write_edge_no_q(edge, edge_predicate))
+
+            # write an edge for every ancestor predicate of the original predicate, with no qualifiers
+            for ancestor_predicate in ancestor_predicates:
+                edges_to_write.append(write_edge_no_q(edge, ancestor_predicate))
+
+            kgx_file_writer.write_normalized_edges(edges_to_write)
diff --git a/Common/supplementation.py b/Common/supplementation.py
index 87830a38..8a27f4f1 100644
--- a/Common/supplementation.py
+++ b/Common/supplementation.py
@@ -7,11 +7,11 @@
 from urllib.request import urlopen
 from zipfile import ZipFile
 from collections import defaultdict
-from Common.node_types import SEQUENCE_VARIANT, GENE, FALLBACK_EDGE_PREDICATE
+from Common.biolink_constants import *
+from Common.normalization import FALLBACK_EDGE_PREDICATE, NormalizationScheme
 from Common.utils import LoggingUtil
 from Common.kgx_file_writer import KGXFileWriter
 from Common.kgx_file_normalizer import KGXFileNormalizer
-from Common.kgxmodel import NormalizationScheme
 
 
 SNPEFF_SO_PREDICATES = {
@@ -178,14 +178,14 @@ def convert_snpeff_to_kgx(self,
                         annotations_to_write[effect_predicate].add(gene_curie)
             for effect_predicate, gene_ids in annotations_to_write.items():
                 for gene_id in gene_ids:
+                    edge_props = {KNOWLEDGE_LEVEL: PREDICATION,
+                                  AGENT_TYPE: COMPUTATIONAL_MODEL}
                     if gene_distances[gene_id]:
                         try:
-                            edge_props = {'distance_to_feature': int(gene_distances[gene_id])}
+                            edge_props['distance_to_feature'] = int(gene_distances[gene_id])
                         except ValueError:
-                            edge_props = None
-                    else:
-                        edge_props = None
-                    output_file_writer.write_node(gene_id, None, [GENE])
+                            pass
+                    output_file_writer.write_node(gene_id, "", [NAMED_THING, GENE])
                     output_file_writer.write_edge(subject_id=variant_id,
                                                   object_id=gene_id,
                                                   predicate=effect_predicate,
diff --git a/Common/utils.py b/Common/utils.py
index 037d6a92..30ef06f1 100644
--- a/Common/utils.py
+++ b/Common/utils.py
@@ -3,7 +3,6 @@
 import tarfile
 import gzip
 import requests
-import pandas as pd
 import orjson
 from dateutil import parser as dp
 from itertools import islice
@@ -16,7 +15,6 @@
 from ftplib import FTP
 from datetime import datetime
 from logging.handlers import RotatingFileHandler
-from pathlib import Path
 
 
 class LoggingUtil(object):
@@ -266,23 +264,30 @@ def get_http_file_modified_date(self, file_url: str):
self.logger.error(error_message) raise GetDataPullError(error_message) - def pull_via_http(self, url: str, data_dir: str, is_gzip=False) -> int: + def pull_via_http(self, url: str, data_dir: str, is_gzip=False, saved_file_name: str = None) -> int: """ gets the file from an http stream. :param url: :param data_dir: :param is_gzip: + :param saved_file_name: :return: the number of bytes read """ - # get the filename - data_file: str = url.split('/')[-1] + # is_gzip isn't used on the main branch, but it's probably on some branches or forks, + # lets throw this for a while, so it's not mysteriously removed + if is_gzip: + raise NotImplementedError(f'is_gzip is deprecated, unzip files during parsing not retrieval!') - # init the byte counter + # get the name of the file to write + data_file: str = saved_file_name if saved_file_name else url.split('/')[-1] + + # this tracks how much, if any, of the file is downloaded + # (it's not really used anymore, it could be more simple) byte_counter: int = 0 - # get the file if its not there + # check if the file exists already if not os.path.exists(os.path.join(data_dir, data_file)): self.logger.debug(f'Retrieving {url} -> {data_dir}') @@ -290,17 +295,9 @@ def pull_via_http(self, url: str, data_dir: str, is_gzip=False) -> int: hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'} req = request.Request(url, headers=hdr) - # get the the file data handle + # get the file data handle file_data = request.urlopen(req) - # is this a gzip file - if is_gzip: - # get a handle to the data - file_data = gzip.GzipFile(fileobj=file_data) - - # strip off the .gz if exists - data_file = data_file.replace('.gz', '') - with open(os.path.join(data_dir, data_file), 'wb') as fp: # specify the buffered data block size block = 131072 diff --git a/Dockerfile b/Dockerfile index a65ba0b6..4031f26d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # A docker container with neo4j, java and python for Data Services -FROM neo4j:4.4.10 +FROM neo4j:5.19.0-community-bullseye RUN apt-get update \ && apt-get -y install python3 \ diff --git a/README.md b/README.md index 36f24fad..56dbbe1b 100644 --- a/README.md +++ b/README.md @@ -31,12 +31,20 @@ git clone https://github.com/RobokopU24/ORION.git Next create directories where data sources, graphs, and logs will be stored. ORION_STORAGE - for storing data sources + ORION_GRAPHS - for storing knowledge graphs + ORION_LOGS - for storing logs -You can do this manually, or use the script indicated below to set up a standard configuration (Option 1 or 2). +You can do this manually, or use the script indicated below to set up a default configuration. + +Option 1: Use this script to create the directories and set the environment variables: +``` +cd ~/ORION_root/ORION/ +source ./set_up_test_env.sh +``` -Option 1: Create three directories and set environment variables specifying paths to the locations of those directories. +Option 2: Create three directories and manually set environment variables specifying paths to the locations of those directories. ``` mkdir ~/ORION_root/storage/ export ORION_STORAGE=~/ORION_root/storage/ @@ -48,12 +56,6 @@ mkdir ~/ORION_root/logs/ export ORION_LOGS=~/ORION_root/logs/ ``` -Option 2: Use this script to create the directories and set the environment variables: -``` -cd ~/ORION_root/ORION/ -source ./set_up_test_env.sh -``` - Next create or select a Graph Spec yaml file where the content of knowledge graphs to be built will be specified. 
Use either of the following options, but not both: @@ -91,11 +93,11 @@ docker-compose up ``` If you want to specify an individual graph you can override the default command with a graph id from your Spec. ``` -docker-compose run --rm data_services python /ORION/Common/build_manager.py Example_Graph_ID +docker-compose run --rm orion python /ORION/Common/build_manager.py Example_Graph_ID ``` To run the ORION pipeline for a single data source, you can use: ``` -docker-compose run --rm data_services python /ORION/Common/load_manager.py Example_Source +docker-compose run --rm orion python /ORION/Common/load_manager.py Example_Source ``` To see available arguments and a list of supported data sources: ``` @@ -142,5 +144,5 @@ Now you can use that source ID in a graph spec to include your new source in a g After you alter the codebase, or if you are experiencing issues or errors you may want to run tests: ``` -docker-compose run --rm data_services pytest /ORION +docker-compose run --rm orion pytest /ORION ``` \ No newline at end of file diff --git a/cli/generate_redundant_kg.py b/cli/generate_redundant_kg.py new file mode 100644 index 00000000..2ece03af --- /dev/null +++ b/cli/generate_redundant_kg.py @@ -0,0 +1,14 @@ +import argparse +from Common.redundant_kg import generate_redundant_kg + + +if __name__ == '__main__': + ap = argparse.ArgumentParser(description='Generate redundant edge files. ' + 'currently expanding from predicate and qualified_predicate.') + ap.add_argument('-i', '--infile', help='Input edge file path', required=True) + ap.add_argument('-o', '--outfile', help='Output edge file path', required=False) + args = vars(ap.parse_args()) + + infile = args['infile'] + edges_file_path = args['outfile'] + generate_redundant_kg(infile, edges_file_path) diff --git a/cli/neo4j_dump.py b/cli/neo4j_dump.py index 51a6e1ec..1f7b50ee 100644 --- a/cli/neo4j_dump.py +++ b/cli/neo4j_dump.py @@ -1,16 +1,25 @@ import argparse +import os +from Common.utils import LoggingUtil from Common.neo4j_tools import create_neo4j_dump +logger = LoggingUtil.init_logging("ORION.cli.neo4j_dump", + line_format='medium', + log_file_path=os.environ['ORION_LOGS']) + if __name__ == '__main__': ap = argparse.ArgumentParser(description='') - ap.add_argument('graph_directory') - ap.add_argument('nodes_filename') - ap.add_argument('edges_filename') + ap.add_argument('nodes_filepath') + ap.add_argument('edges_filepath') + ap.add_argument('output_directory') args = vars(ap.parse_args()) - g_directory = args['graph_directory'] - n_filename = args['nodes_filename'] - e_filename = args['edges_filename'] - create_neo4j_dump(graph_directory=g_directory, - nodes_filename=n_filename, - edges_filename=e_filename) + n_filepath = args['nodes_filepath'] + e_filepath = args['edges_filepath'] + output_directory = args['output_directory'] + + create_neo4j_dump(nodes_filepath=n_filepath, + edges_filepath=e_filepath, + output_directory=output_directory, + logger=logger) + diff --git a/docker-compose.yml b/docker-compose.yml index 41f34d5c..a22dd7b7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,5 @@ -version: "3.7" services: orion: - platform: linux/amd64 build: context: . 
command: [python, /ORION/Common/build_manager.py, all] @@ -12,9 +10,10 @@ services: - ORION_GRAPH_SPEC - ORION_GRAPH_SPEC_URL - ORION_OUTPUT_URL - - ORION_NEO4J_PASSWORD - EDGE_NORMALIZATION_ENDPOINT - NODE_NORMALIZATION_ENDPOINT + - NAME_RESOLVER_ENDPOINT + - BL_VERSION - PHAROS_DB_HOST - PHAROS_DB_USER - PHAROS_DB_PASSWORD diff --git a/graph_specs/ctkp-graph-spec.yaml b/graph_specs/ctkp-graph-spec.yaml new file mode 100644 index 00000000..cffefab6 --- /dev/null +++ b/graph_specs/ctkp-graph-spec.yaml @@ -0,0 +1,9 @@ +graphs: + + - graph_id: CTKP_Automat + graph_name: Clinical Trials KP + graph_description: 'The Clinical Trials KP, created and maintained by the Multiomics Provider, provides information on Clinical Trials, ultimately derived from researcher submissions to clinicaltrials.gov, via the Aggregate Analysis of Clinical Trials (AACT) database). Information on select trials includes the NCT Identifier of the trial, interventions used, diseases/conditions relevant to the trial, adverse events, etc.' + graph_url: https://github.com/NCATSTranslator/Translator-All/wiki/Clinical-Trials-KP + output_format: neo4j + sources: + - source_id: ClinicalTrialsKP \ No newline at end of file diff --git a/graph_specs/default-graph-spec.yml b/graph_specs/default-graph-spec.yml index 1228ec87..aa47ce33 100644 --- a/graph_specs/default-graph-spec.yml +++ b/graph_specs/default-graph-spec.yml @@ -26,6 +26,7 @@ graphs: - source_id: CHEBIProps - source_id: CTD - source_id: DrugCentral + - source_id: DrugMechDB - source_id: GtoPdb - source_id: Hetio - source_id: HGNC @@ -38,7 +39,6 @@ graphs: merge_strategy: connected_edge_subset - source_id: PANTHER - source_id: PHAROS - source_version: 'v6_13_4' - source_id: Reactome - source_id: textminingkp - source_id: STRING-DB-Human @@ -85,10 +85,10 @@ graphs: - source_id: OntologicalHierarchy merge_strategy: connected_edge_subset - #- graph_id: + #- graph_id: # graph_name: DisGeNET # graph_description: 'Disease – Gene Network (DisGeNET) is an open-source database on genes and variants associated with human disease. The integrated data are derived from curated repositories, GWAS catalogs, animal models, and the scientific literature.' - # graph_url: + # graph_url: # output_format: # sources: # - source_id: @@ -105,6 +105,16 @@ graphs: - source_id: OntologicalHierarchy merge_strategy: connected_edge_subset +# - graph_id: DrugMechDB_Automat +# graph_name: DrugMechDB +# graph_description: 'A database of paths that represent the mechanism of action from a drug to a disease in an indication.' +# graph_url: https://sulab.github.io/DrugMechDB/ +# output_format: neo4j +# sources: +# - source_id: DrugMechDB +# - source_id: OntologicalHierarchy +# merge_strategy: connected_edge_subset + - graph_id: GenomeAllianceOrthologs_Automat graph_name: Alliance of Genome Resources graph_description: 'The Alliance of Genome Resources is a consortium of the owners of seven model organism databases (MODs) and the Gene Ontology (GO) Consortium, whose goal is to provide an integrated view of their data to all biologists, clinicians, and other interested parties. The Alliance provides the results of all methods that have been benchmarked by the Quest for Orthologs Consortium (QfO), as well as curated ortholog inferences from HGNC (human, mouse, rat genes).' 
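The ctkp-graph-spec.yaml added above follows the same Graph Spec shape as the default spec: a top-level graphs list whose entries carry a graph_id, descriptive metadata, an output_format, and a sources list. For orientation only (this helper is not part of ORION), a spec file can be inspected with PyYAML; the path and field names below mirror the file added in this patch.
```python
import yaml  # PyYAML

with open('graph_specs/ctkp-graph-spec.yaml') as spec_file:
    spec = yaml.safe_load(spec_file)

for graph in spec['graphs']:
    print(graph['graph_id'], '->', graph.get('output_format'))
    for source in graph.get('sources', []):
        # merge_strategy is optional per source, as in the default spec
        print('  ', source['source_id'], source.get('merge_strategy', ''))
```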
diff --git a/graph_specs/dug-graph-spec.yaml b/graph_specs/dug-graph-spec.yaml new file mode 100644 index 00000000..c223b009 --- /dev/null +++ b/graph_specs/dug-graph-spec.yaml @@ -0,0 +1,24 @@ +graphs: + - graph_id: DugSemanticSearchKG + graph_name: KG for Semantic search + graph_description: + graph_url: + conflation: True + output_format: + sources: + - source_id: CHEBIProps + - source_id: CTD + - source_id: DrugCentral + - source_id: GtoPdb + - source_id: Hetio + - source_id: HGNC + - source_id: HMDB + - source_id: IntAct + - source_id: MonarchKG + - source_id: MONDOProps + - source_id: OntologicalHierarchy + merge_strategy: connected_edge_subset + - source_id: PANTHER + - source_id: PHAROS + - source_id: Reactome + - source_id: UbergraphNonredundant \ No newline at end of file diff --git a/graph_specs/litcoin-graph-spec.yml b/graph_specs/litcoin-graph-spec.yml new file mode 100644 index 00000000..d48dc8d3 --- /dev/null +++ b/graph_specs/litcoin-graph-spec.yml @@ -0,0 +1,19 @@ +graphs: + + - graph_id: LitCoinSapBERT + graph_name: LitCoinSapBERT + graph_description: A graph for the LitCoin project using SapBERT + conflation: True + strict_normalization: True + output_format: neo4j + sources: + - source_id: LitCoinSapBERT + + - graph_id: LitCoin + graph_name: LitCoin + graph_description: A graph for the LitCoin project using Name Resolver for name resolution + conflation: True + strict_normalization: True + output_format: neo4j + sources: + - source_id: LitCoin \ No newline at end of file diff --git a/graph_specs/rule-mining-graph-spec.yaml b/graph_specs/rule-mining-graph-spec.yaml new file mode 100644 index 00000000..8588e9b5 --- /dev/null +++ b/graph_specs/rule-mining-graph-spec.yaml @@ -0,0 +1,30 @@ +# Rule mining graph spec +graphs: + + - graph_id: RobokopRuleMiningKG + graph_name: ROBOKOP Rule Mining KG + graph_description: 'The ROBOKOP Knowledge Graph (ROBOKOP KG) is an open-source biomedical KG that supports the ROBOKOP application and currently contains millions of biomedical relationships derived from dozens of integrated and harmonized biological knowledge sources and bio-ontologies. The ROBOKOP KG includes curated components of most of the Automat KGs, as well as other knowledge sources. Most of the ROBOKOP knowledge sources are curated. However, the ROBOKOP KG also includes text-mined assertions from PubMed and PubMed Central that have been derived from natural language processing (NLP). Note that text-based assertions, while providing valuable information, must be interpreted with caution, as NLP algorithms may introduce false assertions. This version excludes text mining edges and is used for rule mining.' 
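A note on the cli/generate_redundant_kg.py script added earlier in this patch: it delegates to Common.redundant_kg.generate_redundant_kg, whose implementation is not part of this diff, to expand edges "from predicate and qualified_predicate". Purely as a hypothetical illustration of what that kind of expansion can look like on a jsonl edge file (the function name, file format, and ancestor lookup below are assumptions, not ORION code):
```python
import json


def write_redundant_edges(edges_in_path: str, edges_out_path: str, predicate_ancestors: dict):
    """Hypothetical sketch: write each edge plus one copy per ancestor predicate.
    predicate_ancestors stands in for a Biolink Model hierarchy lookup, e.g.
    {'biolink:affects': ['biolink:related_to']}."""
    with open(edges_in_path) as infile, open(edges_out_path, 'w') as outfile:
        for line in infile:
            edge = json.loads(line)  # assumes one JSON edge per line
            outfile.write(json.dumps(edge) + '\n')
            for ancestor in predicate_ancestors.get(edge['predicate'], []):
                outfile.write(json.dumps({**edge, 'predicate': ancestor}) + '\n')
```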
+ graph_url: + conflation: True + output_format: redundant_jsonl + sources: + - source_id: BINDING-DB + - source_id: CHEBIProps + - source_id: CTD + - source_id: DrugCentral + - source_id: DrugMechDB + - source_id: GtoPdb + - source_id: Hetio + - source_id: HGNC + - source_id: HMDB + - source_id: HumanGOA + - source_id: IntAct + - source_id: MonarchKG + - source_id: MONDOProps + - source_id: OntologicalHierarchy + merge_strategy: connected_edge_subset + - source_id: PANTHER + - source_id: PHAROS + - source_id: Reactome + - source_id: STRING-DB-Human + - source_id: UbergraphNonredundant \ No newline at end of file diff --git a/graph_specs/yeast-graph-spec.yml b/graph_specs/yeast-graph-spec.yml index 44f55205..3c972e33 100644 --- a/graph_specs/yeast-graph-spec.yml +++ b/graph_specs/yeast-graph-spec.yml @@ -26,6 +26,7 @@ graphs: - source_id: CHEBIProps - source_id: CTD - source_id: DrugCentral + - source_id: DrugMechDB - source_id: GtoPdb - source_id: Hetio - source_id: HGNC @@ -38,7 +39,6 @@ graphs: merge_strategy: connected_edge_subset - source_id: PANTHER - source_id: PHAROS - source_version: 'v6_13_4' - source_id: Reactome - source_id: textminingkp - source_id: STRING-DB-Human diff --git a/helm/orion/renci-slim.yaml b/helm/orion/renci-slim.yaml new file mode 100644 index 00000000..434d2fa9 --- /dev/null +++ b/helm/orion/renci-slim.yaml @@ -0,0 +1,15 @@ +# Values for running ORION on Sterling at RENCI with minimal resource consumption +orion: + neo4jScratchVolume: + size: 40Gi + resources: + limits: + cpu: "2500m" + memory: 8Gi + ephemeral-storage: 2Gi + requests: + cpu: "2500m" + memory: 6Gi + ephemeral-storage: 1Gi + + diff --git a/helm/orion/renci-values.yaml b/helm/orion/renci-values.yaml index 245f9c36..4a81cd9d 100644 --- a/helm/orion/renci-values.yaml +++ b/helm/orion/renci-values.yaml @@ -13,21 +13,21 @@ orion: image: repository: ghcr.io/robokopu24/orion pullPolicy: IfNotPresent - tag: v1.0.9 + tag: v1.0.14 graphsVolume: use_nfs: true nfs_server: na-projects.edc.renci.org - nfs_path: /stars/ORION + nfs_path: /stars/Data_services nfs_mount_subpath: biolink3/graphs/ sourcesVolume: use_nfs: true nfs_server: na-projects.edc.renci.org - nfs_path: /stars/ORION + nfs_path: /stars/Data_services nfs_mount_subpath: biolink3/storage/ extraVolume: use_extra_volume: true nfs_server: na-projects.edc.renci.org - nfs_path: /stars/ORION + nfs_path: /stars/Data_services nfs_mount_subpath: sterling/ neo4jScratchVolume: size: 80Gi @@ -46,7 +46,8 @@ orion: normalization: nodeNormEndpoint: https://nodenormalization-sri.renci.org/ edgeNormEndpoint: https://bl-lookup-sri.renci.org/ - outputURL: https://stars.renci.org/var/plater/bl-3.5.4/ + bl_version: 4.2.1 + outputURL: https://stars.renci.org/var/plater/bl-4.2.1/ pharos: host: pod-host-or-ip @@ -56,9 +57,13 @@ pharos: db_name: PHAROS drugcentral: - host: pod-host-or-ip - port: 5432 - user: dc-user - password: dc-pass + host: unmtid-dbs.net + port: 5433 + user: drugman + password: dosage db_name: drugcentral + # host: pod-host-or-ip + # port: 5432 + # user: dc-user + # password: dc-pass diff --git a/helm/orion/templates/graph-builder.yaml b/helm/orion/templates/graph-builder.yaml index 31c873a5..b695167a 100644 --- a/helm/orion/templates/graph-builder.yaml +++ b/helm/orion/templates/graph-builder.yaml @@ -42,6 +42,10 @@ spec: {{- end }} - mountPath: /data name: ds-neo4j-scratch-volume + subPath: neo4j_data + - mountPath: /logs + name: ds-neo4j-scratch-volume + subPath: neo4j_logs env: - name: ORION_STORAGE value: /ORION_storage @@ -56,10 +60,10 @@ spec: 
{{- end }} - name: ORION_LOGS value: /ORION_logs - - name: ORION_NEO4J_PASSWORD - value: ds-password - name: ORION_OUTPUT_URL value: {{ .Values.orion.outputURL }} + - name: BL_VERSION + value: {{ .Values.orion.normalization.bl_version }} {{- if .Values.orion.normalization.nodeNormEndpoint }} - name: NODE_NORMALIZATION_ENDPOINT value: {{ .Values.orion.normalization.nodeNormEndpoint }} @@ -112,6 +116,10 @@ spec: {{- end }} - mountPath: /data name: ds-neo4j-scratch-volume + subPath: neo4j_data + - mountPath: /logs + name: ds-neo4j-scratch-volume + subPath: neo4j_logs env: - name: ORION_STORAGE value: /ORION_storage @@ -126,10 +134,10 @@ spec: {{- end }} - name: ORION_LOGS value: /ORION_logs - - name: ORION_NEO4J_PASSWORD - value: ds-password - name: ORION_OUTPUT_URL value: {{ .Values.orion.outputURL }} + - name: BL_VERSION + value: {{ .Values.orion.normalization.bl_version }} {{- if .Values.orion.normalization.nodeNormEndpoint }} - name: NODE_NORMALIZATION_ENDPOINT value: {{ .Values.orion.normalization.nodeNormEndpoint }} diff --git a/parsers/BINDING/src/loadBINDINGDB.py b/parsers/BINDING/src/loadBINDINGDB.py index 524a4c06..9b2b0db1 100644 --- a/parsers/BINDING/src/loadBINDINGDB.py +++ b/parsers/BINDING/src/loadBINDINGDB.py @@ -1,14 +1,17 @@ import os import enum import math -from zipfile import ZipFile as zipfile -import requests as rq +import json +import requests + +from zipfile import ZipFile +from requests.adapters import HTTPAdapter, Retry from parsers.BINDING.src.bindingdb_constraints import LOG_SCALE_AFFINITY_THRESHOLD #Change the binding affinity threshold here. Default is 10 uM Ki,Kd,EC50,orIC50 -from Common.utils import GetData +from Common.utils import GetData, GetDataPullError from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor -from Common.node_types import PUBLICATIONS, AFFINITY +from Common.biolink_constants import PUBLICATIONS, AFFINITY, AFFINITY_PARAMETER, KNOWLEDGE_LEVEL, AGENT_TYPE, KNOWLEDGE_ASSERTION, MANUAL_AGENT # Full Binding Data. @@ -31,7 +34,7 @@ def negative_log(concentration_nm): ### This function converts nanomolar concent return -(math.log10(concentration_nm*(10**-9))) def generate_zipfile_rows(zip_file_path, file_inside_zip, delimiter='\\t'): - with zipfile(zip_file_path, 'r') as zip_file: + with ZipFile(zip_file_path, 'r') as zip_file: with zip_file.open(file_inside_zip, 'r') as file: for line in file: yield str(line).split(delimiter) @@ -50,7 +53,7 @@ class BINDINGDBLoader(SourceDataLoader): source_data_url = "https://www.bindingdb.org/rwd/bind/chemsearch/marvin/SDFdownload.jsp?all_download=yes" license = "All data and download files in bindingDB are freely available under a 'Creative Commons BY 3.0' license.'" attribution = 'https://www.bindingdb.org/rwd/bind/info.jsp' - parsing_version = '1.4' + parsing_version = '1.6' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -59,26 +62,26 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ # call the super super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) - #6 is the stand in threshold value until a better value can be determined - #We may not even use the thresholds, that way all data can be captured. + # 5 is the stand in threshold value until a better value can be determined + # We may not even use the thresholds, that way all data can be captured. 
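To make the threshold comment above concrete: negative_log, defined near the top of this file, turns a nanomolar measurement into the negative log of its molar concentration, so the 10 uM default cutoff named in the bindingdb_constraints import sits at 5 on that scale, which is why 5 is the stand-in threshold. A small worked example:
```python
import math


def negative_log(concentration_nm):
    # same conversion as the module-level helper: nM -> -log10(molar)
    return -math.log10(concentration_nm * (10 ** -9))


print(negative_log(10_000))  # 10 uM == 10,000 nM -> 5.0, the stand-in threshold
print(negative_log(50))      # a 50 nM Kd -> ~7.3 on the same scale
```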
self.affinity_threshold = LOG_SCALE_AFFINITY_THRESHOLD self.measure_to_predicate = { - "pKi": "biolink:binds", + "pKi": "{DGIDB}:inhibitor", #inhibition constant "pIC50": "CTD:decreases_activity_of", - "pKd": "biolink:binds", + "pKd": "RO:0002436", "pEC50": "CTD:increases_activity_of", - "k_on": "biolink:binds", - "k_off": "biolink:binds" + "k_on": "RO:0002436", + "k_off": "RO:0002436" } self.bindingdb_version = None self.bindingdb_version = self.get_latest_source_version() self.bindingdb_data_url = f"https://www.bindingdb.org/bind/downloads/" - self.BD_archive_file_name = f"BindingDB_All_{self.bindingdb_version}_tsv.zip" - self.BD_file_name = f"BindingDB_All_{self.bindingdb_version}.tsv" - self.data_files = [self.BD_archive_file_name] + self.bd_archive_file_name = f"BindingDB_All_{self.bindingdb_version}_tsv.zip" + self.bd_file_name = f"BindingDB_All.tsv" + self.data_files = [self.bd_archive_file_name] def get_latest_source_version(self) -> str: """ @@ -87,22 +90,36 @@ def get_latest_source_version(self) -> str: """ if self.bindingdb_version: return self.bindingdb_version - ### The method below gets the database version from the html, but this may be subject to change. ### - binding_db_download_page_response = rq.get('https://www.bindingdb.org/rwd/bind/chemsearch/marvin/Download.jsp') - version_index = binding_db_download_page_response.text.index('BindingDB_All_2D_') + 17 - bindingdb_version = binding_db_download_page_response.text[version_index:version_index + 6] - - return f"{bindingdb_version}" + try: + s = requests.Session() + retries = Retry(total=5, + backoff_factor=2) + s.mount('https://', HTTPAdapter(max_retries=retries)) + + ### The method below gets the database version from the html, but this may be subject to change. ### + binding_db_download_page_response = requests.get('https://www.bindingdb.org/rwd/bind/chemsearch/marvin/Download.jsp', timeout=8) + version_index = binding_db_download_page_response.text.index('BindingDB_All_2D_') + 17 + bindingdb_version = binding_db_download_page_response.text[version_index:version_index + 6] + self.bindingdb_version = bindingdb_version + return f"{bindingdb_version}" + + except requests.exceptions.SSLError: + # BINDING-DB often has ssl related errors with the jsp page + error_message = f'BINDING-DB had an SSL error while attempting to retrieve version..' + except requests.exceptions.Timeout: + error_message = f'BINDING-DB timed out attempting to retrieve version...' + except ValueError: + error_message = f'BINDING-DB get_latest_source_version got a response but could not determine the version' + raise GetDataPullError(error_message=error_message) def get_data(self) -> int: """ Gets the bindingdb data. 
- """ + # download the zipped data data_puller = GetData() - for source in self.data_files: - source_url = f"{self.bindingdb_data_url}{source}" - data_puller.pull_via_http(source_url, self.data_path) + source_url = f"{self.bindingdb_data_url}{self.bd_archive_file_name}" + data_puller.pull_via_http(source_url, self.data_path) return True def parse_data(self) -> dict: @@ -116,7 +133,8 @@ def parse_data(self) -> dict: data_store= dict() columns = [[x.value,x.name] for x in BD_EDGEUMAN if x.name not in ['PMID','PUBCHEM_AID','PATENT_NUMBER','PUBCHEM_CID','UNIPROT_TARGET_CHAIN']] - for n,row in enumerate(generate_zipfile_rows(os.path.join(self.data_path,self.BD_archive_file_name), self.BD_file_name)): + zipped_data_path = os.path.join(self.data_path, self.bd_archive_file_name) + for n,row in enumerate(generate_zipfile_rows(zipped_data_path, self.bd_file_name)): if n == 0: continue if self.test_mode: @@ -129,9 +147,9 @@ def parse_data(self) -> dict: if (ligand == '') or (protein == ''): # Check if Pubchem or UniProt ID is missing. continue - publication = f"PMID:{row[BD_EDGEUMAN.PMID.value]}" if row[BD_EDGEUMAN.PMID.value] != '' else None - assay_id = f"PUBCHEM.AID:{row[BD_EDGEUMAN.PUBCHEM_AID.value]}" if row[BD_EDGEUMAN.PUBCHEM_AID.value] != '' else None - patent = f"PATENT:{row[BD_EDGEUMAN.PATENT_NUMBER.value]}" if row[BD_EDGEUMAN.PATENT_NUMBER.value] != '' else None + publication = f"PMID:{row[BD_EDGEUMAN.PMID.value]}" if row[BD_EDGEUMAN.PMID.value] else None + assay_id = f"PUBCHEM.AID:{row[BD_EDGEUMAN.PUBCHEM_AID.value]}" if row[BD_EDGEUMAN.PUBCHEM_AID.value] else None + patent = f"PATENT:{row[BD_EDGEUMAN.PATENT_NUMBER.value]}" if row[BD_EDGEUMAN.PATENT_NUMBER.value] else None for column in columns: if row[column[0]] != '': @@ -142,22 +160,24 @@ def parse_data(self) -> dict: # already has another measurement type in the row, and that other measurement has far more value. continue ligand_protein_measure_key = f"{ligand}~{protein}~{measure_type}" - # The section below checks through all of the previous entry keys and uses - if ligand_protein_measure_key in data_store: # TODO start here + # if we already created an entry with the same ligand-protein-measure_type key, use it + if ligand_protein_measure_key in data_store: entry = data_store[ligand_protein_measure_key] - found_key = True else: - entry = {} - entry.update({'ligand': f"PUBCHEM.COMPOUND:{ligand}"}) - entry.update({'protein': f"UniProtKB:{protein}"}) - entry.update({'predicate': self.measure_to_predicate[measure_type]}) - entry.update({'affinity_parameter': measure_type}) - entry.update({'supporting_affinities': []}) - entry.update({'publications': []}) - entry.update({'pubchem_assay_ids': []}) - entry.update({'patent_ids': []}) + # otherwise make what will turn into an edge + entry = {'ligand': f"PUBCHEM.COMPOUND:{ligand}", + 'protein': f"UniProtKB:{protein}", + 'predicate': self.measure_to_predicate[measure_type], + AFFINITY_PARAMETER: measure_type, + 'supporting_affinities': [], + PUBLICATIONS: [], + 'pubchem_assay_ids': [], + 'patent_ids': [], + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} data_store[ligand_protein_measure_key] = entry - #If there's a > in the result, it means that this is a dead compound, i.e. it won't bass + + # If there's a > in the result, it means that this is a dead compound, i.e. 
it won't pass # our activity/inhibition threshold if ">" in row[column[0]]: continue @@ -166,8 +186,8 @@ def parse_data(self) -> dict: if sa == 0: continue entry["supporting_affinities"].append(sa) - if publication is not None and publication not in entry["publications"]: - entry["publications"].append(publication) + if publication is not None and publication not in entry[PUBLICATIONS]: + entry[PUBLICATIONS].append(publication) if assay_id is not None and assay_id not in entry["pubchem_assay_ids"]: entry["pubchem_assay_ids"].append(assay_id) if patent is not None and patent not in entry["patent_ids"]: @@ -178,33 +198,29 @@ def parse_data(self) -> dict: if len(entry["supporting_affinities"]) == 0: bad_entries.add(key) continue - if len(entry["publications"]) == 0: - del entry["publications"] + if len(entry[PUBLICATIONS]) == 0: + del entry[PUBLICATIONS] if len(entry["pubchem_assay_ids"]) == 0: del entry["pubchem_assay_ids"] if len(entry["patent_ids"]) == 0: del entry["patent_ids"] try: average_affinity = sum(entry["supporting_affinities"])/len(entry["supporting_affinities"]) - entry["affinity"] = round(negative_log(average_affinity),2) + entry[AFFINITY] = round(negative_log(average_affinity),2) entry["supporting_affinities"] = [round(negative_log(x),2) for x in entry["supporting_affinities"]] - except: + except Exception as e: bad_entries.add(key) + self.logger.warning(f'Error calculating affinities for entry: {json.dumps(entry,indent=4)} (error: {e})') - import json - for badkey in bad_entries: - bad_entry = data_store.pop(badkey) - if len(bad_entry["supporting_affinities"]) == 0: - continue - print(json.dumps(bad_entry,indent=4)) + for bad_key in bad_entries: + del data_store[bad_key] extractor = Extractor(file_writer=self.output_file_writer) - extractor.json_extract(data_store, - lambda item: data_store[item]['ligand'], # subject id - lambda item: data_store[item]['protein'], # object id - lambda item: data_store[item]['predicate'], # predicate - lambda item: {}, #Node 1 props - lambda item: {}, #Node 2 props - lambda item: {key:value for key,value in data_store[item].items() if key not in ['ligand','protein']} #Edge props - ) + extractor.json_extract(data_store.values(), + lambda item: item['ligand'], # subject id + lambda item: item['protein'], # object id + lambda item: item['predicate'], # predicate + lambda item: {}, # subject props + lambda item: {}, # object props + lambda item: {k: v for k, v in item.items() if key not in ['ligand', 'protein', 'predicate']}) #Edge props return extractor.load_metadata diff --git a/parsers/CTD/src/loadCTD.py b/parsers/CTD/src/loadCTD.py index 814891b2..608f561d 100644 --- a/parsers/CTD/src/loadCTD.py +++ b/parsers/CTD/src/loadCTD.py @@ -12,7 +12,7 @@ from Common.loader_interface import SourceDataLoader, SourceDataFailedError from Common.kgxmodel import kgxnode, kgxedge from Common.prefixes import CTD, NCBITAXON, MESH -from Common.node_types import PUBLICATIONS +from Common.biolink_constants import * ############## @@ -30,14 +30,26 @@ class CTDLoader(SourceDataLoader): source_data_url = "http://ctdbase.org/reports/" license = "http://ctdbase.org/about/publications/#citing" attribution = "http://ctdbase.org/about/" - parsing_version: str = '1.3' + parsing_version: str = '1.5' + # some CTD predicates no longer have mappings in the biolink model, convert them to something that will normalize predicate_conversion_map = { 'CTD:decreases_molecular_interaction_with': 'CTD:decreases_molecular_interaction', 'CTD:increases_molecular_interaction_with': 
'CTD:increases_molecular_interaction', 'CTD:ameliorates': 'biolink:treats_or_applied_or_studied_to_treat' } + # mappings for predicate: (knowledge level, agent type) + exposure_events_KL_AT_lookup = { + 'CTD:positive_correlation': (STATISTICAL_ASSOCIATION, MANUAL_AGENT), + 'CTD:negative_correlation': (STATISTICAL_ASSOCIATION, MANUAL_AGENT), + 'CTD:prediction_hypothesis': (PREDICATION, NOT_PROVIDED) + } + chemical_disease_KL_AT_lookup = { + 'CTD:contributes_to': (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + 'biolink:treats_or_applied_or_studied_to_treat': (KNOWLEDGE_ASSERTION, MANUAL_AGENT) + } + def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ :param test_mode - sets the run into test mode @@ -63,11 +75,10 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): self.ctd_data_files = [self.ctd_chemical_to_disease_file, self.ctd_exposure_events_file] - self.data_files = [] - self.data_files.extend(self.hand_curated_files) - self.data_files.extend(self.ctd_data_files) + self.data_files = self.hand_curated_files + self.ctd_data_files - self.previous_node_ids = set() + self.final_record_counter: int = 0 + self.final_skipped_record_counter: int = 0 def get_latest_source_version(self) -> str: """ @@ -75,6 +86,7 @@ def get_latest_source_version(self) -> str: :return: """ + try: # load the web page for CTD html_page: requests.Response = requests.get('http://ctdbase.org/about/dataStatus.go') @@ -90,8 +102,7 @@ def get_latest_source_version(self) -> str: # save the value return version.text.split(':')[1].strip().replace(' ', '_') except Exception as e: - pass - raise GetDataPullError(error_message=f'Unable to determine latest version for CTD') + raise GetDataPullError(error_message=f'Unable to determine latest version for CTD: {e}') def get_data(self): """ @@ -114,44 +125,26 @@ def parse_data(self) -> dict: :return: """ - final_record_count: int = 0 - final_skipped_count: int = 0 - # process chemical to gene (expanded) curated_files_archive_path = os.path.join(self.data_path, self.hand_curated_data_archive) - records, skipped = self.chemical_to_gene_exp(curated_files_archive_path, - self.hand_curated_chemical_to_gene_file) - - # add to the final counts - final_record_count += records - final_skipped_count += skipped + self.chemical_to_gene_exp(curated_files_archive_path, + self.hand_curated_chemical_to_gene_file) # process disease to exposure exposures_file_path = os.path.join(self.data_path, self.ctd_exposure_events_file) - records, skipped = self.disease_to_exposure(exposures_file_path) - - # add to the final counts - final_record_count += records - final_skipped_count += skipped + self.disease_to_exposure(exposures_file_path) # disease to chemical disease_to_chemical_file_path = os.path.join(self.data_path, self.ctd_chemical_to_disease_file) - records, skipped = self.disease_to_chemical(disease_to_chemical_file_path) + self.disease_to_chemical(disease_to_chemical_file_path) - # add to the final counts - final_record_count += records - final_skipped_count += skipped - - # load up the metadata - load_metadata: dict = { - 'num_source_lines': final_record_count, - 'unusable_source_lines': final_skipped_count + parse_metadata: dict = { + 'num_source_lines': self.final_record_counter, + 'unusable_source_lines': self.final_skipped_record_counter } + return parse_metadata - # return the metadata to the caller - return load_metadata - - def chemical_to_gene_exp(self, archive_path: str, chemical_to_gene_file: str) -> (list, list, int, int): + def 
chemical_to_gene_exp(self, archive_path: str, chemical_to_gene_file: str): """ Parses the data file to create chemical to gene nodes and relationships @@ -201,9 +194,6 @@ def chemical_to_gene_exp(self, archive_path: str, chemical_to_gene_file: str) -> skipped_record_counter += 1 continue - # get the edge predicate - predicate = self.convert_predicates(f"{CTD}:{predicate_label}") - # capitalize the node IDs chemical_id: str = r['chemicalID'].upper() gene_id: str = r['geneID'].upper() @@ -224,6 +214,13 @@ def chemical_to_gene_exp(self, archive_path: str, chemical_to_gene_file: str) -> edge_subject: str = gene_id edge_object: str = chemical_id + # get the edge predicate + predicate = self.convert_predicates(f"{CTD}:{predicate_label}") + + # all edges from this file get the same KL/AT + edge_props[KNOWLEDGE_LEVEL] = KNOWLEDGE_ASSERTION + edge_props[AGENT_TYPE] = MANUAL_AGENT + # save the edge new_edge = kgxedge(edge_subject, edge_object, @@ -232,10 +229,10 @@ def chemical_to_gene_exp(self, archive_path: str, chemical_to_gene_file: str) -> edgeprops=edge_props) self.output_file_writer.write_kgx_edge(new_edge) - # return the record counters to the caller - return record_counter, skipped_record_counter + self.final_record_counter += record_counter + self.final_skipped_record_counter += skipped_record_counter - def disease_to_exposure(self, file_path: str) -> (list, list, int, int): + def disease_to_exposure(self, file_path: str): """ Parses the data file to create disease to exposure nodes and relationships @@ -276,6 +273,8 @@ def disease_to_exposure(self, file_path: str) -> (list, list, int, int): continue else: predicate: str = self.convert_predicates(f"{CTD}:{predicate_label}") + knowledge_level, agent_type = self.exposure_events_KL_AT_lookup.get(predicate, + (NOT_PROVIDED, NOT_PROVIDED)) # save the disease node disease_id = f'{MESH}:' + r['diseaseid'] @@ -292,11 +291,13 @@ def disease_to_exposure(self, file_path: str) -> (list, list, int, int): disease_id, predicate=predicate, primary_knowledge_source=self.provenance_id, - edgeprops={PUBLICATIONS: [f"PMID:{r['reference']}"]}) + edgeprops={PUBLICATIONS: [f"PMID:{r['reference']}"], + KNOWLEDGE_LEVEL: knowledge_level, + AGENT_TYPE: agent_type}) self.output_file_writer.write_kgx_edge(new_edge) - # return the node and edge lists to the caller - return record_counter, skipped_record_counter + self.final_record_counter += record_counter + self.final_skipped_record_counter += skipped_record_counter def disease_to_chemical(self, file_path: str): """ @@ -331,7 +332,7 @@ def disease_to_chemical(self, file_path: str): # iterate through node groups and create the edge records. 
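As a side note before the loop below: disease_to_chemical walks sorted_data with a manual record counter, priming on the first disease and advancing until the DiseaseID changes. When the rows are already sorted by DiseaseID, the same one-disease-at-a-time grouping can also be written with itertools.groupby; the toy rows below are made up for illustration, and only the DiseaseID field name comes from the loader.
```python
from itertools import groupby
from operator import itemgetter

# hypothetical rows, pre-sorted by DiseaseID as in the loader
sorted_data = [
    {'DiseaseID': 'MESH:D000001', 'chemical': 'MESH:C000111'},
    {'DiseaseID': 'MESH:D000001', 'chemical': 'MESH:C000222'},
    {'DiseaseID': 'MESH:D000002', 'chemical': 'MESH:C000333'},
]

for disease_id, rows in groupby(sorted_data, key=itemgetter('DiseaseID')):
    chemicals = [row['chemical'] for row in rows]
    print(disease_id, chemicals)  # one group of edges per disease
```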
while record_counter < record_count: - # if its the first time in prime the pump + # if it's the first time in prime the pump if first: # save the disease id cur_disease_id = sorted_data[record_counter]['DiseaseID'] @@ -353,7 +354,7 @@ def disease_to_chemical(self, file_path: str): # increment the record counter record_counter += 1 - # insure we dont overrun the list + # ensure we dont overrun the list if record_counter >= record_count: break @@ -431,13 +432,12 @@ def disease_to_chemical(self, file_path: str): if predicate == self.therapeutic_predicate: publications = treats_refs - # was this node already added + # write the disease node if it wasn't already written + # (note that this check is not really necessary because the file writer will prevent duplicates, + # but it's slightly more efficient to not create the node object again) if not disease_node_added: - # add the disease node disease_node = kgxnode(cur_disease_id.upper(), name=cur_disease_name) self.output_file_writer.write_kgx_node(disease_node) - - # set the flag so we dont duplicate adding this node disease_node_added = True # add the chemical node @@ -447,22 +447,26 @@ def disease_to_chemical(self, file_path: str): # add the edge predicate = self.convert_predicates(predicate) + knowledge_level, agent_type = self.chemical_disease_KL_AT_lookup.get(predicate, (NOT_PROVIDED, + NOT_PROVIDED)) new_edge = kgxedge(chemical_id, cur_disease_id.upper(), predicate=predicate, primary_knowledge_source=self.provenance_id, - edgeprops={PUBLICATIONS: publications}) + edgeprops={PUBLICATIONS: publications, + KNOWLEDGE_LEVEL: knowledge_level, + AGENT_TYPE: agent_type}) self.output_file_writer.write_kgx_edge(new_edge) - # insure we dont overrun the list + # ensure we dont overrun the list if record_counter >= record_count: break # save the next disease id cur_disease_id = sorted_data[record_counter]['DiseaseID'] - # return the node/edge lists and counters to the caller - return record_counter, skipped_record_counter + self.final_record_counter += record_counter + self.final_skipped_record_counter += skipped_record_counter @staticmethod def check_expanded_gene_chemical_row(r): diff --git a/parsers/FooDB/src/loadFDB.py b/parsers/FooDB/src/loadFDB.py index 2622d4bc..149a78d1 100644 --- a/parsers/FooDB/src/loadFDB.py +++ b/parsers/FooDB/src/loadFDB.py @@ -82,6 +82,7 @@ def get_data(self): # and get a reference to the data gatherer gd: GetData = GetData(self.logger.level) + if(self.full_url_path==None): self.get_latest_source_version() # get all the files noted above file_count, foodb_dir, self.tar_dir_name = gd.get_foodb_files(self.full_url_path, self.data_path, self.archive_name, self.data_files) diff --git a/parsers/GOA/src/loadGOA.py b/parsers/GOA/src/loadGOA.py index e7332019..3d1337e0 100644 --- a/parsers/GOA/src/loadGOA.py +++ b/parsers/GOA/src/loadGOA.py @@ -7,7 +7,7 @@ from Common.extractor import Extractor from io import TextIOWrapper from Common.utils import GetData -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE, PUBLICATIONS +from Common.biolink_constants import * from Common.prefixes import NCBITAXON @@ -32,6 +32,49 @@ class DATACOLS(enum.IntEnum): Gene_Product_Form_ID = 16 +GOA_PREDICATES = {'enables': 'RO:0002327', + 'involved_in': 'RO:0002331', + 'located_in': 'RO:0001025', + 'contributes_to': 'RO:0002326', + 'acts_upstream_of': 'RO:0002263', + 'part_of': 'BFO:0000050', + 'acts_upstream_of_positive_effect': 'RO:0004034', + 'is_active_in': 'RO:0002432', + 'acts_upstream_of_negative_effect': 'RO:0004035', + 'colocalizes_with': 
'RO:0002325', + 'acts_upstream_of_or_within': 'RO:0002264', + 'acts_upstream_of_or_within_positive_effect': 'RO:0004032', + 'acts_upstream_of_or_within_negative_effect': 'RO:0004033'} + +GOA_EVIDENCE_CODE_TO_KL_AT = { + "EXP": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "IDA": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "IPI": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "IMP": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "IGI": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "IEP": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "HTP": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "HDA": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "HMP": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "HGI": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "HEP": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "IBA": (PREDICATION, MANUAL_VALIDATION_OF_AUTOMATED_AGENT), + "IBD": (PREDICATION, MANUAL_VALIDATION_OF_AUTOMATED_AGENT), + "IKR": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "IRD": (PREDICATION, MANUAL_AGENT), + "ISS": (PREDICATION, MANUAL_VALIDATION_OF_AUTOMATED_AGENT), + "ISO": (PREDICATION, MANUAL_VALIDATION_OF_AUTOMATED_AGENT), + "ISA": (PREDICATION, MANUAL_VALIDATION_OF_AUTOMATED_AGENT), + "ISM": (PREDICATION, MANUAL_VALIDATION_OF_AUTOMATED_AGENT), + "IGC": (PREDICATION, MANUAL_AGENT), + "RCA": (PREDICATION, MANUAL_VALIDATION_OF_AUTOMATED_AGENT), + "TAS": (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + "NAS": (PREDICATION, MANUAL_AGENT), + "IC": (PREDICATION, MANUAL_AGENT), + "ND": (NOT_PROVIDED, NOT_PROVIDED), + "IEA": (PREDICATION, AUTOMATED_AGENT) +} + ############## # Class: UniProtKB GOA loader # @@ -47,7 +90,7 @@ class GOALoader(SourceDataLoader): source_data_url = "ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/" license = "https://www.ebi.ac.uk/about/terms-of-use/" attribution = "https://www.ebi.ac.uk/GOA/publications" - parsing_version = '1.1' + parsing_version = '1.2' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -140,20 +183,23 @@ def parse_data(self) -> dict: return extractor.load_metadata -GOA_PREDICATES = {'enables':'RO:0002327', - 'involved_in':'RO:0002331', - 'located_in':'RO:0001025', - 'contributes_to':'RO:0002326', - 'acts_upstream_of':'RO:0002263', - 'part_of':'BFO:0000050', - 'acts_upstream_of_positive_effect':'RO:0004034', - 'is_active_in':'RO:0002432', - 'acts_upstream_of_negative_effect':'RO:0004035', - 'colocalizes_with':'RO:0002325', - 'acts_upstream_of_or_within':'RO:0002264', - 'acts_upstream_of_or_within_positive_effect':'RO:0004032', - 'acts_upstream_of_or_within_negative_effect':'RO:0004033'} - +def get_goa_edge_properties(line: list): + try: + knowledge_level, agent_type = GOA_EVIDENCE_CODE_TO_KL_AT[line[DATACOLS.Evidence_Code.value]] + except KeyError as k: + knowledge_level, agent_type = NOT_PROVIDED, NOT_PROVIDED + + edge_properties = {PRIMARY_KNOWLEDGE_SOURCE: GOALoader.provenance_id, + KNOWLEDGE_LEVEL: knowledge_level, + AGENT_TYPE: agent_type} + publications = [] + evidence_field = line[DATACOLS.DB_Reference.value] + for evidence in evidence_field.split('|'): + if 'PMID' in evidence: + publications.append(evidence) + if publications: + edge_properties[PUBLICATIONS] = publications + return edge_properties def get_goa_predicate(line: list): supplied_qualifier = line[DATACOLS.Qualifier.value] @@ -174,18 +220,6 @@ def get_goa_predicate(line: list): return GOA_PREDICATES[supplied_qualifier] -def get_goa_edge_properties(line: list): - edge_properties = {PRIMARY_KNOWLEDGE_SOURCE: GOALoader.provenance_id} - publications = [] - evidence_field = line[DATACOLS.DB_Reference.value] - for evidence in evidence_field.split('|'): - if 
'PMID' in evidence: - publications.append(evidence) - if publications: - edge_properties[PUBLICATIONS] = publications - return edge_properties - - def get_goa_subject_props(line: list): taxon_id = line[DATACOLS.Taxon_Interacting_taxon].split('|')[0].split(':')[-1] return {"taxon": f'{NCBITAXON}:{taxon_id}'} if taxon_id else {} diff --git a/parsers/GTEx/src/loadGTEx.py b/parsers/GTEx/src/loadGTEx.py index 335c6c46..dc15ea93 100644 --- a/parsers/GTEx/src/loadGTEx.py +++ b/parsers/GTEx/src/loadGTEx.py @@ -4,9 +4,8 @@ import argparse from urllib import request from Common.normalization import NodeNormalizer -from Common.utils import LoggingUtil -from Common.loader_interface import SourceDataLoader, SourceDataBrokenError, SourceDataFailedError -from Common.node_types import SEQUENCE_VARIANT, GENE +from Common.loader_interface import SourceDataLoader +from Common.biolink_constants import * from Common.prefixes import HGVS, UBERON from Common.hgvs_utils import convert_variant_to_hgvs @@ -19,10 +18,11 @@ class GTExLoader(SourceDataLoader): source_data_url = "https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/" license = "https://www.gtexportal.org/home/documentationPage" attribution = "https://www.gtexportal.org/home/documentationPage" - parsing_version = '1.2' + parsing_version = '1.3' has_sequence_variants = True # this probably won't change very often - just hard code it for now + # TODO have GTEX dynamically get version and file url (starting point: https://gtexportal.org/api/v2/metadata/dataset) GTEX_VERSION = "8" # tissue name to uberon curies, the tissue names will match gtex file names @@ -248,8 +248,10 @@ def create_edge(self, predicate = "CTD:decreases_expression_of" edge_properties = {'expressed_in': [anatomy_id], - 'p_value': [float(p_value)], - 'slope': [float(slope)]} + P_VALUE: [float(p_value)], + 'slope': [float(slope)], + KNOWLEDGE_LEVEL: PREDICATION, + AGENT_TYPE: COMPUTATIONAL_MODEL} # write out the coalesced edge for the previous group self.output_file_writer.write_edge(subject_id=variant_id, diff --git a/parsers/GWASCatalog/src/loadGWASCatalog.py b/parsers/GWASCatalog/src/loadGWASCatalog.py index 196708b3..1d8c4e8f 100644 --- a/parsers/GWASCatalog/src/loadGWASCatalog.py +++ b/parsers/GWASCatalog/src/loadGWASCatalog.py @@ -1,15 +1,14 @@ import argparse -import logging import os import re import enum from sys import float_info from collections import defaultdict -from Common.utils import LoggingUtil, GetData -from Common.loader_interface import SourceDataLoader, SourceDataBrokenError, SourceDataFailedError +from Common.utils import GetData +from Common.loader_interface import SourceDataLoader from Common.kgxmodel import kgxnode, kgxedge -from Common.node_types import SEQUENCE_VARIANT, DISEASE_OR_PHENOTYPIC_FEATURE, PUBLICATIONS +from Common.biolink_constants import * from Common.prefixes import DBSNP, EFO, ORPHANET, HP, NCIT, MONDO, GO @@ -150,7 +149,9 @@ def parse_data(self) -> dict: # get pubmed id pubmed_id = row[DATACOLS.PUBMEDID.value] - edge_props = {PUBLICATIONS: [f'PMID:{pubmed_id}']} + edge_props = {PUBLICATIONS: [f'PMID:{pubmed_id}'], + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} # get p-value p_value_string = row[DATACOLS.P_VALUE.value] diff --git a/parsers/GenomeAlliance/src/loadGenomeAlliance.py b/parsers/GenomeAlliance/src/loadGenomeAlliance.py index 8e01e6ec..9027f519 100644 --- a/parsers/GenomeAlliance/src/loadGenomeAlliance.py +++ b/parsers/GenomeAlliance/src/loadGenomeAlliance.py @@ -6,7 +6,7 @@ from Common.utils import 
GetData from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE # the data header columns for the orthologs tsv file: diff --git a/parsers/IntAct/src/loadIA.py b/parsers/IntAct/src/loadIA.py index 6ebaf07c..84422394 100644 --- a/parsers/IntAct/src/loadIA.py +++ b/parsers/IntAct/src/loadIA.py @@ -7,6 +7,7 @@ from csv import reader from operator import itemgetter from zipfile import ZipFile +from Common.biolink_constants import * from Common.utils import GetData from Common.loader_interface import SourceDataLoader, SourceDataFailedError from Common.prefixes import NCBITAXON, UNIPROTKB @@ -74,7 +75,7 @@ class IALoader(SourceDataLoader): source_data_url = "https://www.ebi.ac.uk/intact/" license = "https://www.ebi.ac.uk/about/terms-of-use/" attribution = "http://europepmc.org/article/MED/24234451" - parsing_version: str = '1.1' + parsing_version: str = '1.2' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -83,6 +84,9 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) + self.ftp_site = 'ftp.ebi.ac.uk' + self.ftp_dir = '/pub/databases/IntAct/current/psimitab/' + self.data_file: str = 'intact.zip' self.source_db: str = 'IntAct Molecular Interaction Database' @@ -99,7 +103,7 @@ def get_latest_source_version(self) -> str: gd = GetData(self.logger.level) # get the file date - ret_val: str = gd.get_ftp_file_date('ftp.ebi.ac.uk', '/pub/databases/IntAct/current/psimitab/', self.data_file) + ret_val: str = gd.get_ftp_file_date(self.ftp_site, self.ftp_dir, self.data_file) # return to the caller return ret_val @@ -111,14 +115,8 @@ def get_data(self) -> int: """ # get a reference to the data gathering class gd: GetData = GetData(self.logger.level) - - # do the real thing if we arent in debug mode - if not self.test_mode: - file_count: int = gd.pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/IntAct/current/psimitab/', [self.data_file], self.data_path) - else: - file_count: int = 1 - - # return the file count to the caller + file_count: int = gd.pull_via_ftp(self.ftp_site, self.ftp_dir, [self.data_file], + self.data_path) return file_count def parse_data(self) -> dict: @@ -379,7 +377,10 @@ def get_edge_list(self): # add the interacting node edges subject_id = f"{grp_list[grp_idx]['u_a']}" object_id = f"{grp_list[grp_idx]['u_b']}" - edge_props = {"publications": f"{grp_list[grp_idx]['pub_id']}", "detection_method": detection_method} + edge_props = {PUBLICATIONS: grp_list[grp_idx]['pub_id'], + "detection_method": detection_method, + KNOWLEDGE_LEVEL: NOT_PROVIDED, + AGENT_TYPE: NOT_PROVIDED} new_edge = kgxedge(subject_id, object_id, predicate="RO:0002436", @@ -392,10 +393,13 @@ def get_edge_list(self): # add the taxa edges subject_id = f"{grp_list[grp_idx]['u_' + suffix]}" object_id = f"{grp_list[grp_idx]['t_' + suffix]}" + edge_props = {KNOWLEDGE_LEVEL: NOT_PROVIDED, + AGENT_TYPE: NOT_PROVIDED} new_edge = kgxedge(subject_id, object_id, predicate="RO:0002162", - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edgeprops=edge_props) self.final_edge_list.append(new_edge) # goto the next pair diff --git a/parsers/KinAce/src/loadKinAce.py b/parsers/KinAce/src/loadKinAce.py index 2f8e6a17..4e76c26a 100644 --- a/parsers/KinAce/src/loadKinAce.py +++ b/parsers/KinAce/src/loadKinAce.py @@ -8,6 
+8,7 @@ from Common.extractor import Extractor from Common.biolink_constants import * + ############## # Class: Loading kinase-substrate phosphorylation reactions from KinAce # By: Jon-Michael Beasley diff --git a/parsers/LitCoin/src/loadLitCoin.py b/parsers/LitCoin/src/loadLitCoin.py new file mode 100644 index 00000000..851aeba8 --- /dev/null +++ b/parsers/LitCoin/src/loadLitCoin.py @@ -0,0 +1,370 @@ +import time +import os +import json +import re +import requests +from collections import defaultdict + +from Common.biolink_utils import BiolinkUtils +from Common.loader_interface import SourceDataLoader +from Common.biolink_constants import PUBLICATIONS +from Common.utils import GetData, snakify +from Common.normalization import call_name_resolution, NAME_RESOLVER_API_ERROR +from Common.prefixes import PUBMED + + +LLM_SUBJECT_NAME = 'subject' +LLM_SUBJECT_TYPE = 'subject_type' +LLM_OBJECT_NAME = 'object' +LLM_OBJECT_TYPE = 'object_type' +LLM_RELATIONSHIP = 'relationship' +LLM_MAIN_FINDING = 'main_finding' + +LLM_SUBJECT_NAME_EDGE_PROP = 'llm_subject' +LLM_SUBJECT_TYPE_EDGE_PROP = 'llm_subject_type' +LLM_OBJECT_NAME_EDGE_PROP = 'llm_object' +LLM_OBJECT_TYPE_EDGE_PROP = 'llm_object_type' +LLM_RELATIONSHIP_EDGE_PROP = 'llm_relationship' +LLM_ABSTRACT_SUMMARY_EDGE_PROP = 'llm_summary' + +NODE_TYPE_MAPPINGS = { + "Activity": "Activity", + "AnatomicalStructure": "AnatomicalEntity", + "AnatomicalFeature": "AnatomicalEntity", + "Antibody": "ChemicalEntity", + "Behavior": "Behavior", + "BiologicalStructure": "AnatomicalEntity", + "BiologicalPathway": "Pathway", + "CellType": "Cell", + "Chemical": "ChemicalEntity", + "Chemicals": "ChemicalEntity", + "Condition": "PhenotypicFeature", + "Device": "Device", + "Disease": "Disease", + "DiseaseSymptom": "DiseaseOrPhenotypicFeature", + "Drug": "Drug", + "DrugClass": "Drug", + "Gene": "Gene", + "LifestyleFactor": "Behavior", + "Organ": "AnatomicalEntity", + "OrganSystem": "AnatomicalEntity", + "OrganismHuman": "Cohort", + "OrganismHumanEthnicGroup": "PopulationOfIndividualOrganisms", + "OrganismPart": "AnatomicalEntity", + "Organization": "Agent", + "Phenotype": "PhenotypicFeature", + "Procedure": "Procedure", + "Protein": "Protein", + "Proteins": "Protein", + "StatisticalMethod": "Activity", + "Symptom": "PhenotypicFeature", + "Technique": "Procedure", + "Therapy": "Procedure", + "Treatment": "Procedure" +} + + +############## +# Class: LitCoin source loader +# +# Desc: Class that loads/parses the LitCoin data. 
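One detail worth calling out before the class definition that follows: the NODE_TYPE_MAPPINGS table above is keyed by a Pascal-cased form of the entity types the LLM emits; the loader produces that key by stripping parentheses and slashes and title-casing each word (convert_node_type_to_biolink_format, defined further down). A standalone illustration, with a made-up input string:
```python
import re

NODE_TYPE_MAPPINGS = {"OrganismHuman": "Cohort", "BiologicalPathway": "Pathway"}


def to_pascal_case_type(node_type: str) -> str:
    # mirrors convert_node_type_to_biolink_format: drop ()/ then force Pascal case
    cleaned = re.sub("[()/]", "", node_type)
    return "".join(seg[0].upper() + seg[1:].lower() for seg in cleaned.split())


llm_type = "Organism (Human)"            # hypothetical LLM-provided type
pascal = to_pascal_case_type(llm_type)   # 'OrganismHuman'
print(NODE_TYPE_MAPPINGS.get(pascal))    # 'Cohort'
```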
+############## +class LitCoinLoader(SourceDataLoader): + + source_id: str = 'LitCoin' + provenance_id: str = 'infores:robokop-kg' # TODO - change this to a LitCoin infores when it exists + parsing_version: str = '1.8' + + def __init__(self, test_mode: bool = False, source_data_dir: str = None): + """ + :param test_mode - sets the run into test mode + :param source_data_dir - the specific storage directory to save files in + """ + super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) + + self.data_url = 'https://stars.renci.org/var/data_services/litcoin/' + self.data_file = 'abstracts_CompAndHeal_gpt4_20240320_train.json' + self.data_files = [self.data_file] + # dicts of name to id lookups organized by node type (node_name_to_id_lookup[node_type] = dict of names -> id) + self.node_name_to_id_lookup = defaultdict(dict) + self.name_res_stats = [] + self.bl_utils = BiolinkUtils() + + def get_latest_source_version(self) -> str: + latest_version = 'v1.3' + return latest_version + + def get_data(self) -> bool: + source_data_url = f'{self.data_url}{self.data_file}' + data_puller = GetData() + data_puller.pull_via_http(source_data_url, self.data_path) + return True + + def parse_data(self) -> dict: + """ + Parses the data file for graph nodes/edges + + :return: ret_val: load_metadata + """ + + # could use cached results for faster dev runs with something like this + # with open(os.path.join(self.data_path, "litcoin_name_res_results.json"), "w") as name_res_results_file: + # self.node_name_to_id_lookup = json.load(name_res_results_file) + + records = 0 + skipped_records = 0 + litcoin_file_path: str = os.path.join(self.data_path, self.data_file) + with open(litcoin_file_path) as litcoin_file: + litcoin_json = json.load(litcoin_file) + for litcoin_object in litcoin_json: + pubmed_id = f'{PUBMED}:{litcoin_object["abstract_id"]}' + llm_output = litcoin_object['output'] + for litcoin_edge in self.parse_llm_output(llm_output): + self.logger.info(f'processing edge {records}') + subject_resolution_results = self.process_llm_node(litcoin_edge[LLM_SUBJECT_NAME], + litcoin_edge[LLM_SUBJECT_TYPE]) + if not subject_resolution_results or \ + NAME_RESOLVER_API_ERROR in subject_resolution_results: + skipped_records += 1 + continue + object_resolution_results = self.process_llm_node(litcoin_edge[LLM_OBJECT_NAME], + litcoin_edge[LLM_OBJECT_TYPE]) + if not object_resolution_results or \ + NAME_RESOLVER_API_ERROR in object_resolution_results: + skipped_records += 1 + continue + self.output_file_writer.write_node(node_id=subject_resolution_results['curie'], + node_name=subject_resolution_results['name']) + self.output_file_writer.write_node(node_id=object_resolution_results['curie'], + node_name=object_resolution_results['name']) + + self.output_file_writer.write_node(node_id=pubmed_id, + node_properties={ + 'abstract_summary': litcoin_edge[LLM_ABSTRACT_SUMMARY_EDGE_PROP] + }) + + predicate = 'biolink:' + snakify(litcoin_edge[LLM_RELATIONSHIP]) + edge_properties = { + PUBLICATIONS: [pubmed_id], + LLM_SUBJECT_NAME_EDGE_PROP: litcoin_edge[LLM_SUBJECT_NAME], + LLM_SUBJECT_TYPE_EDGE_PROP: litcoin_edge[LLM_SUBJECT_TYPE], + LLM_OBJECT_NAME_EDGE_PROP: litcoin_edge[LLM_OBJECT_NAME], + LLM_OBJECT_TYPE_EDGE_PROP: litcoin_edge[LLM_OBJECT_TYPE], + LLM_RELATIONSHIP_EDGE_PROP: litcoin_edge[LLM_RELATIONSHIP], + LLM_ABSTRACT_SUMMARY_EDGE_PROP: litcoin_edge[LLM_ABSTRACT_SUMMARY_EDGE_PROP], + # LLM_MAIN_FINDING: litcoin_edge[LLM_MAIN_FINDING] + LLM_MAIN_FINDING: True + } + 
self.output_file_writer.write_edge(subject_id=subject_resolution_results['curie'], + object_id=object_resolution_results['curie'], + predicate=predicate, + edge_properties=edge_properties) + self.output_file_writer.write_edge(subject_id=subject_resolution_results['curie'], + object_id=pubmed_id, + predicate='biolink:related_to') + self.output_file_writer.write_edge(subject_id=object_resolution_results['curie'], + object_id=pubmed_id, + predicate='biolink:related_to') + records += 1 + if records > 100 and self.test_mode: + break + + # write out name res results alongside the output + with open(os.path.join(self.data_path, "..", + f"parsed_{self.parsing_version}", + "litcoin_name_res_results.json"), "w") as name_res_results_file: + + # include the biolink type used to call name res + for node_type, node_name_to_results_dict in self.node_name_to_id_lookup.items(): + node_type_used_for_name_res = NODE_TYPE_MAPPINGS.get(self.convert_node_type_to_biolink_format(node_type), + None) + for results in node_name_to_results_dict.values(): + if results and NAME_RESOLVER_API_ERROR not in results: + results['queried_type'] = node_type_used_for_name_res + json.dump(self.node_name_to_id_lookup, + name_res_results_file, + indent=4, + sort_keys=True) + + # write name res lookup times + with open(os.path.join(self.data_path, "..", + f"parsed_{self.parsing_version}", + "name_res_timing_litcoin.tsv"), "w") as name_res_timing_file: + name_res_timing_file.writelines(self.name_res_stats) + + parsing_metadata = { + 'records': records, + 'skipped_records': skipped_records + } + return parsing_metadata + + def process_llm_node(self, node_name: str, node_type: str): + + # check if we did name resolution for this name and type already and return it if so + if node_name in self.node_name_to_id_lookup[node_type]: + return self.node_name_to_id_lookup[node_type][node_name] + + # otherwise call the name res service and try to find a match + # the following node_type string formatting conversion is kind of unnecessary now, + # it was intended to produce valid biolink types given the node_type from the llm, + # but that doesn't really work well enough to use, now we use the NODE_TYPE_MAPPINGS mappings, + # but the keys currently use the post-conversion format so this stays for now + biolink_node_type = self.convert_node_type_to_biolink_format(node_type) + preferred_biolink_node_type = NODE_TYPE_MAPPINGS.get(biolink_node_type, None) + self.logger.info(f'calling name res for {node_name} - {preferred_biolink_node_type}') + start_time = time.time() + name_resolution_results = self.name_resolution_function(node_name, preferred_biolink_node_type) + elapsed_time = time.time() - start_time + standardized_name_res_result = self.standardize_name_resolution_results(name_resolution_results) + self.name_res_stats.append(f"{node_name}\t{preferred_biolink_node_type}\t{elapsed_time}\n") + self.node_name_to_id_lookup[node_type][node_name] = standardized_name_res_result + return standardized_name_res_result + + def convert_node_type_to_biolink_format(self, node_type): + try: + biolink_node_type = re.sub("[()/]", "", node_type) # remove parentheses and forward slash + biolink_node_type = "".join([node_type_segment[0].upper() + node_type_segment[1:].lower() + for node_type_segment in biolink_node_type.split()]) # force Pascal case + return f'{biolink_node_type}' + except TypeError as e: + self.logger.error(f'Bad node type provided by llm: {node_type}') + return "" + + def parse_llm_output(self, llm_output): + + # this attempts to extract the 
abstract summary from the llm output, + # it's not great because the summary is not inside the json part of the output + abstract_summary = "Summary not provided or could not be parsed." + if "Summary:" in llm_output: + if "\n\nBiological Entities:" in llm_output: + abstract_summary = llm_output.split("Summary: ")[1].split("Biological Entities:")[0].strip() + elif "\n\nEntities:" in llm_output: + abstract_summary = llm_output.split("Summary: ")[1].split("Entities:")[0].strip() + + # the rest of the logic is from Miles at CoVar, it parses the current format of output from the llm + required_fields = [LLM_SUBJECT_NAME, + LLM_SUBJECT_TYPE, + LLM_OBJECT_NAME, + LLM_OBJECT_TYPE, + LLM_RELATIONSHIP + ] + matches = re.findall(r'\{([^\}]*)\}', llm_output) + valid_responses = [] + for match in matches: + cur_response = '{' + match + '}' + try: + cur_response_dict = json.loads(cur_response) + cur_response_dict[LLM_ABSTRACT_SUMMARY_EDGE_PROP] = abstract_summary + except json.decoder.JSONDecodeError as e: + self.logger.error(f'Error decoding JSON: {e}') + continue + for field in required_fields: + if field not in cur_response_dict: + self.logger.warning(f'Missing field {field} in response: {cur_response_dict}') + break + if not isinstance(cur_response_dict[field], str): + self.logger.warning(f'Non-string field {field} in response: {cur_response_dict}') + break + else: # only add the fields which have all the fields + valid_responses.append(cur_response_dict) + return valid_responses + + def name_resolution_function(self, node_name, preferred_biolink_node_type, retries=0): + return call_name_resolution(node_name, + preferred_biolink_node_type, + retries, logger=self.logger) + + def standardize_name_resolution_results(self, name_res_json): + if not name_res_json: + return {} + elif NAME_RESOLVER_API_ERROR in name_res_json: + return name_res_json + return { + "curie": name_res_json['curie'], + "name": name_res_json['label'], + "types": list(self.bl_utils.find_biolink_leaves(set(name_res_json['types']))), + "score": name_res_json['score'] + } + + +class LitCoinSapBERTLoader(LitCoinLoader): + source_id: str = 'LitCoinSapBERT' + parsing_version: str = '1.6' + + def name_resolution_function(self, node_name, preferred_biolink_node_type, retries=0): + sapbert_url = 'https://babel-sapbert.apps.renci.org/annotate/' + sapbert_payload = { + "text": node_name, + "model_name": "sapbert", + "count": 1000, + "args": {"bl_type": preferred_biolink_node_type} + } + sapbert_response = requests.post(sapbert_url, json=sapbert_payload) + if sapbert_response.status_code == 200: + sapbert_json = sapbert_response.json() + # return the first result if there is one + if sapbert_json: + return sapbert_json[0] + else: + error_message = f'Non-200 Sapbert result {sapbert_response.status_code} for request {sapbert_payload}.' + if retries < 3: + self.logger.error(error_message + f' Retrying (attempt {retries + 1})... 
') + return self.name_resolution_function(node_name, preferred_biolink_node_type, retries + 1) + else: + self.logger.error(error_message + f' Giving up...') + # if no results return None + return None + + def standardize_name_resolution_results(self, name_res_json): + if not name_res_json: + return None + return { + "curie": name_res_json['curie'], + "name": name_res_json['name'], + "types": [name_res_json['category']], + "score": name_res_json['score'] + } + + +class LitCoinEntityExtractorLoader(LitCoinLoader): + source_id: str = 'LitCoinEntityExtractor' + parsing_version: str = '1.3' + + def parse_data(self) -> dict: + litcoin_file_path: str = os.path.join(self.data_path, self.data_file) + all_entities = {} + with open(litcoin_file_path) as litcoin_file: + litcoin_json = json.load(litcoin_file) + for litcoin_object in litcoin_json: + abstract_id = litcoin_object['abstract_id'] + llm_output = litcoin_object['output'] + for litcoin_edge in self.parse_llm_output(llm_output): + + subject_name = litcoin_edge[LLM_SUBJECT_NAME] + subject_type = litcoin_edge[LLM_SUBJECT_TYPE] + subject_mapped_type = NODE_TYPE_MAPPINGS.get(self.convert_node_type_to_biolink_format(subject_type), + None) + all_entities[f'{subject_name}{subject_type}'] = {'name': subject_name, + 'llm_type': subject_type, + 'name_res_type': subject_mapped_type, + 'abstract_id': abstract_id} + object_name = litcoin_edge[LLM_OBJECT_NAME] + object_type = litcoin_edge[LLM_OBJECT_TYPE] + object_mapped_type = NODE_TYPE_MAPPINGS.get(self.convert_node_type_to_biolink_format(object_type), + None) + all_entities[f'{object_name}{object_type}'] = {'name': object_name, + 'llm_type': object_type, + 'name_res_type': object_mapped_type, + 'abstract_id': abstract_id} + + with open(os.path.join(self.data_path, "..", + f"parsed_{self.parsing_version}", + "name_res_inputs.csv"), "w") as name_res_inputs: + name_res_inputs.write("query,llm_type,biolink_type,abstract_id\n") + for entity in all_entities.values(): + name_res_inputs.write(f'"{entity["name"]}","{entity["llm_type"]}",{entity["name_res_type"]},{entity["abstract_id"]}\n') + self.logger.info(f'{len(all_entities.values())} unique entities extracted') + return {} + + + diff --git a/parsers/PHAROS/src/loadPHAROS.py b/parsers/PHAROS/src/loadPHAROS.py index a36277d4..87e6dc57 100644 --- a/parsers/PHAROS/src/loadPHAROS.py +++ b/parsers/PHAROS/src/loadPHAROS.py @@ -5,7 +5,7 @@ from Common.loader_interface import SourceDataLoader, SourceDataBrokenError, SourceDataFailedError from Common.kgxmodel import kgxnode, kgxedge -from Common.node_types import GENE, DISEASE_OR_PHENOTYPIC_FEATURE, PUBLICATIONS +from Common.biolink_constants import * from Common.utils import GetData, snakify from Common.db_connectors import MySQLConnector from Common.predicates import DGIDB_PREDICATE_MAPPING @@ -19,7 +19,7 @@ class PHAROSLoader(SourceDataLoader): source_data_url = "https://pharos.nih.gov/" license = "Data accessed from Pharos and TCRD is publicly available from the primary sources listed above. Please respect their individual licenses regarding proper use and redistribution." attribution = 'Sheils, T., Mathias, S. et al, "TCRD and Pharos 2021: mining the human proteome for disease biology", Nucl. Acids Res., 2021. 
DOI: 10.1093/nar/gkaa993' - parsing_version: str = '1.5' + parsing_version: str = '1.7' GENE_TO_DISEASE_QUERY: str = """select distinct x.value, d.did, d.name, p.sym, d.dtype, d.score from disease d @@ -62,6 +62,26 @@ class PHAROSLoader(SourceDataLoader): 'UniProt Disease': 'infores:uniprot' } + # we might want more granularity here but for now it's one-to-one source with KL/AT + # we will need to develop a procedure for merging KL/AT moving forward + PHAROS_KL_AT_lookup = { + 'CTD': (PREDICATION, MANUAL_AGENT), + 'DisGeNET': (NOT_PROVIDED, NOT_PROVIDED), + 'DrugCentral Indication': (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + 'eRAM': (NOT_PROVIDED, NOT_PROVIDED), + # For more information about JensenLab Databases: DOI: https://doi.org/10.1093/database/baac019 + 'JensenLab Experiment TIGA': (PREDICATION, AUTOMATED_AGENT), + 'JensenLab Knowledge AmyCo': (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + 'JensenLab Knowledge MedlinePlus': (KNOWLEDGE_ASSERTION, MANUAL_AGENT), + 'JensenLab Knowledge UniProtKB-KW': (KNOWLEDGE_ASSERTION, MANUAL_VALIDATION_OF_AUTOMATED_AGENT), + 'JensenLab Text Mining': (NOT_PROVIDED, TEXT_MINING_AGENT), + 'Monarch': (NOT_PROVIDED, NOT_PROVIDED), + 'UniProt Disease': (KNOWLEDGE_ASSERTION, MANUAL_AGENT) + } + + # these are used for pharos edges which are not from a further upstream source + PHAROS_KL_AT = (KNOWLEDGE_ASSERTION, MANUAL_AGENT) + def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ :param test_mode - sets the run into test mode @@ -74,6 +94,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): self.source_db = 'Target Central Resource Database' self.pharos_db = None self.genetic_association_predicate = 'WIKIDATA_PROPERTY:P2293' + self.target_for_predicate = "biolink:target_for" def get_latest_source_version(self) -> str: """ @@ -81,11 +102,15 @@ def get_latest_source_version(self) -> str: :return: the version of the data """ + """ + we could do something like this but PHAROS is not going to be updated in this same format so it's not helpful url = 'http://juniper.health.unm.edu/tcrd/download/latest.README' response = requests.get(url) first_line = response.text.splitlines()[0] version = first_line.split()[1].replace('.', '_') return version + """ + return 'v6_13_4' def get_data(self): gd: GetData = GetData(self.logger.level) @@ -162,7 +187,11 @@ def parse_gene_to_disease(self) -> (int, int): disease_id = item['did'] disease_name = self.sanitize_name(item['name']) edge_provenance = self.PHAROS_INFORES_MAPPING[item['dtype']] - edge_properties = {'score': float(item['score'])} if item['score'] else {} + knowledge_level, agent_type = self.PHAROS_KL_AT_lookup.get(item['dtype'], (NOT_PROVIDED, NOT_PROVIDED)) + edge_properties = {KNOWLEDGE_LEVEL: knowledge_level, + AGENT_TYPE: agent_type} + if item['score']: + edge_properties['score'] = float(item['score']) # move along, no disease id if disease_id is None: @@ -188,9 +217,13 @@ def parse_gene_to_disease(self) -> (int, int): self.output_file_writer.write_kgx_node(gene_node) if edge_provenance: + if edge_provenance == "infores:drugcentral": + assigned_predicate = self.target_for_predicate + else: + assigned_predicate = self.genetic_association_predicate gene_to_disease_edge = kgxedge(subject_id=gene_id, object_id=disease_id, - predicate=self.genetic_association_predicate, + predicate=assigned_predicate, edgeprops=edge_properties, primary_knowledge_source=edge_provenance, aggregator_knowledge_sources=[self.provenance_id]) @@ -346,8 +379,11 @@ def get_edge_props(self, result) -> 
(str, list, dict, str): else: pmids: list = [] + knowledge_level, agent_type = self.PHAROS_KL_AT + props: dict = {KNOWLEDGE_LEVEL: knowledge_level, + AGENT_TYPE: agent_type} + # if there was affinity data save it - props: dict = {} affinity = result['affinity'] if affinity is not None and affinity != '': props['affinity'] = float(affinity) diff --git a/parsers/Reactome/src/loadReactome.py b/parsers/Reactome/src/loadReactome.py index 19b638dd..a816269b 100755 --- a/parsers/Reactome/src/loadReactome.py +++ b/parsers/Reactome/src/loadReactome.py @@ -9,7 +9,7 @@ from Common.loader_interface import SourceDataLoader from Common.kgxmodel import kgxnode, kgxedge from Common.neo4j_tools import Neo4jTools -from Common.node_types import MACROMOLECULAR_COMPLEX, NAMED_THING +from Common.biolink_constants import * from Common.prefixes import REACTOME, NCBITAXON, GTOPDB, UNIPROTKB, CHEBI, KEGG_COMPOUND, KEGG_GLYCAN, PUBCHEM_COMPOUND, NCBIGENE, CLINVAR from Common.utils import GetData @@ -99,7 +99,7 @@ class ReactomeLoader(SourceDataLoader): source_data_url = "https://reactome.org/" license = "https://reactome.org/license" attribution = "https://academic.oup.com/nar/article/50/D1/D687/6426058?login=false" - parsing_version = '1.2' + parsing_version = '1.3' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -109,9 +109,12 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) self.version_url: str = 'https://reactome.org/about/news' + # we'll rename the neo4j dump as we download it to make neo4j usage easier + # (community edition only allows one database, having just one named 'neo4j' helps) self.neo4j_dump_file = 'reactome.graphdb.dump' + self.saved_neo4j_dump_file = 'neo4j.dump' self.data_url = 'https://reactome.org/download/current/' - self.data_files = [self.neo4j_dump_file] + self.data_files = [self.saved_neo4j_dump_file] self.triple_file: str = 'reactomeContents_CriticalTriples.csv' self.triple_path = os.path.dirname(os.path.abspath(__file__)) @@ -142,15 +145,25 @@ def get_latest_source_version(self) -> str: def get_data(self) -> bool: gd: GetData = GetData(self.logger.level) - for dt_file in self.data_files: - gd.pull_via_http(f'{self.data_url}{dt_file}', - self.data_path) + gd.pull_via_http(f'{self.data_url}{self.neo4j_dump_file}', + self.data_path, saved_file_name=self.saved_neo4j_dump_file) return True def parse_data(self): neo4j_tools = Neo4jTools() - neo4j_tools.load_backup_dump(f'{self.data_path}/{self.neo4j_dump_file}') - neo4j_tools.start_neo4j() + + neo4j_status_code = neo4j_tools.load_backup_dump(f'{self.data_path}/') + if neo4j_status_code: + raise SystemError('Neo4j failed to load the backup dump.') + + neo4j_status_code = neo4j_tools.migrate_dump_to_neo4j_5() + if neo4j_status_code: + raise SystemError('Neo4j failed to migrate the dump to neo4j 5.') + + neo4j_status_code = neo4j_tools.start_neo4j() + if neo4j_status_code: + raise SystemError('Neo4j failed to start.') + neo4j_tools.wait_for_neo4j_initialization() neo4j_driver = neo4j_tools.neo4j_driver @@ -424,12 +437,16 @@ def process_edge_from_neo4j(self, complex_context=None): predicate = PREDICATE_MAPPING.get(relationship_type, None) if predicate: + edge_properties = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} + if complex_context: + edge_properties['complex_context'] = complex_context if not regulation_type: output_edge = kgxedge( subject_id=subject_id, object_id=object_id, 
predicate=predicate, - edgeprops={'complex_context': complex_context} if complex_context else None, + edgeprops=edge_properties, primary_knowledge_source=self.provenance_id ) else: @@ -440,18 +457,16 @@ def process_edge_from_neo4j(self, else: self.logger.warning(f'Unexpected regulation type encountered: {regulation_type}') return - edge_props = { - 'qualified_predicate': 'causes', - 'object_aspect_qualifier': 'expression', - 'object_direction_qualifier': direction, - } - if complex_context: - edge_props['complex_context'] = complex_context + edge_properties.update({ + QUALIFIED_PREDICATE: 'biolink:causes', + OBJECT_ASPECT_QUALIFIER: 'expression', + OBJECT_DIRECTION_QUALIFIER: direction, + }) output_edge = kgxedge( subject_id=subject_id, object_id=object_id, predicate=predicate, - edgeprops=edge_props, + edgeprops=edge_properties, primary_knowledge_source=self.provenance_id ) self.output_file_writer.write_kgx_edge(output_edge) diff --git a/parsers/SGD/src/loadSGD.py b/parsers/SGD/src/loadSGD.py index b49a446e..2c82c9f4 100644 --- a/parsers/SGD/src/loadSGD.py +++ b/parsers/SGD/src/loadSGD.py @@ -6,7 +6,7 @@ from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor from Common.prefixes import PUBMED -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE, NODE_TYPES, PUBLICATIONS +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, NODE_TYPES, PUBLICATIONS from parsers.yeast.src.yeast_constants import SGD_ALL_GENES_FILE # Maps Genes to GO Terms. @@ -18,7 +18,7 @@ class GENEGOTERMS_EDGEUMAN(enum.IntEnum): EVIDENCECODE = 8 EVIDENCECODETEXT = 10 ANNOTATIONTYPE = 12 - EVIDENCEPMID = 13 + EVIDENCEPMID = 15 # Maps Genes to Pathways class GENEPATHWAYS_EDGEUMAN(enum.IntEnum): diff --git a/parsers/SGD/src/sgd_source_retriever.py b/parsers/SGD/src/sgd_source_retriever.py index a702f1f7..89e25939 100644 --- a/parsers/SGD/src/sgd_source_retriever.py +++ b/parsers/SGD/src/sgd_source_retriever.py @@ -18,17 +18,17 @@ def SGDGene2GOTerm(data_directory): print( "---------------------------------------------------\nCollecting all GO Annotation data for all genes on SGD...\n---------------------------------------------------\n") view = ["primaryIdentifier", "secondaryIdentifier", "symbol", "featureType", - "qualifier", "goAnnotation.ontologyTerm.identifier", + "goAnnotation.ontologyTerm.identifier", "goAnnotation.ontologyTerm.name", "goAnnotation.ontologyTerm.namespace", - "goAnnotation.evidence.code.code", "goAnnotation.qualifier", + "goAnnotation.evidence.code.code", "goAnnotation.qualifier", "col1", "goAnnotation.evidence.code.withText", "goAnnotation.annotationExtension", - "goAnnotation.evidence.code.annotType", - "goAnnotation.evidence.publications.pubMedId", + "goAnnotation.evidence.code.annotType", "goAnnotation.parentsIdentifier", + "goAnnotation.parentsName", "goAnnotation.evidence.publications.pubMedId", "goAnnotation.evidence.publications.citation"] # Request all gene2GOTerm data. rqgene2goterm = rq.get( - f"https://yeastmine.yeastgenome.org/yeastmine/service/template/results?name=Gene_GO&constraint1=Gene&op1=LOOKUP&value1=**&extra1=&format=csv") + f"https://yeastmine.yeastgenome.org/yeastmine/service/template/results?name=GoSlimTerm_Gene&constraint1=Gene.goAnnotation.ontologyTerm.parents.name&op1=eq&value1=**&extra1&format=csv") # Parse as CSV object. 
lines = rqgene2goterm.text.splitlines() @@ -48,7 +48,6 @@ def SGDGene2GOTerm(data_directory): gene2gotermdf.fillna("?", inplace=True) print('SGD gene2goterm Data Collected!') print(os.path.join(storage_dir, csv_fname)) - # gene2gotermdf.to_csv(f"//ORION/parsers/yeast/src/{csv_fname}", encoding="utf-8-sig", index=False) gene2gotermdf.to_csv(os.path.join(storage_dir, csv_fname), encoding="utf-8-sig", index=False) diff --git a/parsers/STRING/src/loadSTRINGDB.py b/parsers/STRING/src/loadSTRINGDB.py index 0ac865c6..93ec725c 100644 --- a/parsers/STRING/src/loadSTRINGDB.py +++ b/parsers/STRING/src/loadSTRINGDB.py @@ -7,7 +7,7 @@ from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor from Common.prefixes import ENSEMBL, NCBITAXON -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE +from Common.biolink_constants import * # Full PPI Data. @@ -148,8 +148,10 @@ def parse_data(self) -> dict: "Textmining_transferred":line[PPI_EDGEUMAN.TEXTMINING_TRANSFERRED.value], "Cooccurance":line[PPI_EDGEUMAN.COOCCURANCE.value], "Combined_score":line[PPI_EDGEUMAN.COMBINED_SCORE.value], - "species_context_qualifier": f"{NCBITAXON}:{self.taxon_id}", - PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id + SPECIES_CONTEXT_QUALIFIER: f"{NCBITAXON}:{self.taxon_id}", + PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT }, # edge props comment_character=None, delim=" ", @@ -171,8 +173,10 @@ def parse_data(self) -> dict: "Textmining_transferred":line[PPI_EDGEUMAN.TEXTMINING_TRANSFERRED.value], "Cooccurance":line[PPI_EDGEUMAN.COOCCURANCE.value], "Combined_score":line[PPI_EDGEUMAN.COMBINED_SCORE.value], - "species_context_qualifier": f"{NCBITAXON}:{self.taxon_id}", - PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id + SPECIES_CONTEXT_QUALIFIER: f"{NCBITAXON}:{self.taxon_id}", + PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT }, # edge props comment_character=None, delim=" ", @@ -195,8 +199,10 @@ def parse_data(self) -> dict: "Textmining_transferred":line[PPI_EDGEUMAN.TEXTMINING_TRANSFERRED.value], "Cooccurance":line[PPI_EDGEUMAN.COOCCURANCE.value], "Combined_score":line[PPI_EDGEUMAN.COMBINED_SCORE.value], - "species_context_qualifier": f"{NCBITAXON}:{self.taxon_id}", - PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id + SPECIES_CONTEXT_QUALIFIER: f"{NCBITAXON}:{self.taxon_id}", + PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT }, # edge props comment_character=None, delim=" ", @@ -220,8 +226,10 @@ def parse_data(self) -> dict: "Textmining_transferred":line[PPI_EDGEUMAN.TEXTMINING_TRANSFERRED.value], "Cooccurance":line[PPI_EDGEUMAN.COOCCURANCE.value], "Combined_score":line[PPI_EDGEUMAN.COMBINED_SCORE.value], - "species_context_qualifier": f"{NCBITAXON}:{self.taxon_id}", - PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id + SPECIES_CONTEXT_QUALIFIER: f"{NCBITAXON}:{self.taxon_id}", + PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT }, # edge props comment_character=None, delim=" ", @@ -246,7 +254,9 @@ def parse_data(self) -> dict: "Textmining_transferred":line[PPI_PHYSICAL_EDGEUMAN.TEXTMINING_TRANSFERRED.value], "Combined_score":line[PPI_PHYSICAL_EDGEUMAN.COMBINED_SCORE.value], "species_context_qualifier": f"{NCBITAXON}:{self.taxon_id}", - PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id + PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, + KNOWLEDGE_LEVEL: 
KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT }, # edge props comment_character=None, delim=" ", @@ -257,11 +267,11 @@ def parse_data(self) -> dict: class HumanSTRINGDBLoader(STRINGDBLoader): source_id: str = 'STRING-DB-Human' - parsing_version = '1.1' + parsing_version = '1.2' taxon_id: str = '9606' # Human taxon class YeastSTRINGDBLoader(STRINGDBLoader): source_id: str = 'STRING-DB-Yeast' - parsing_version = '1.1' + parsing_version = '1.2' taxon_id: str = '4932' # Saccharomyces cerevisiae taxon diff --git a/parsers/UberGraph/src/loadUG.py b/parsers/UberGraph/src/loadUG.py index d6578c85..4ac34bb5 100644 --- a/parsers/UberGraph/src/loadUG.py +++ b/parsers/UberGraph/src/loadUG.py @@ -4,7 +4,7 @@ from io import TextIOWrapper from Common.utils import GetData from Common.loader_interface import SourceDataLoader -# from Common.node_types import DESCRIPTION +from Common.biolink_constants import KNOWLEDGE_LEVEL, AGENT_TYPE, KNOWLEDGE_ASSERTION, MANUAL_AGENT from parsers.UberGraph.src.ubergraph import UberGraphTools @@ -91,12 +91,15 @@ def parse_data(self): # object_description = ubergraph_tools.node_descriptions.get(object_curie, None) # object_properties = {DESCRIPTION: object_description} if object_description else {} + edge_props = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} self.output_file_writer.write_node(node_id=subject_curie) self.output_file_writer.write_node(node_id=object_curie) self.output_file_writer.write_edge(subject_id=subject_curie, object_id=object_curie, predicate=predicate_curie, - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edge_properties=edge_props) if self.test_mode and record_counter == 10_000: break diff --git a/parsers/UberGraph/src/ubergraph.py b/parsers/UberGraph/src/ubergraph.py index 24b26a09..4f8a7361 100644 --- a/parsers/UberGraph/src/ubergraph.py +++ b/parsers/UberGraph/src/ubergraph.py @@ -2,17 +2,7 @@ import curies import tarfile from io import TextIOWrapper - - -BIOLINK_DUPLICATE_MAPPINGS = ["agrkb", - "OBOREL", - "oboInOwl", - "oboformat"] - -BIOLINK_MAPPING_CHANGES = { - 'KEGG': 'http://identifiers.org/kegg/', - 'NCBIGene': 'https://identifiers.org/ncbigene/' -} +from Common.biolink_utils import get_biolink_prefix_map OBO_MISSING_MAPPINGS = { 'NCBIGene': 'http://purl.obolibrary.org/obo/NCBIGene_', @@ -40,7 +30,6 @@ def __init__(self, self.node_descriptions = self.get_node_descriptions(ubergraph_url) if load_node_descriptions else None - def convert_iris_to_curies(self): if self.logger: self.logger.info(f'Converting all Ubergraph iris to curies..') @@ -73,11 +62,11 @@ def convert_iris_to_curies(self): if node_mapping_failures: self.logger.info(f'Node conversion failure examples: {node_mapping_failures[:10]}') self.logger.info(f'Edges: {len(self.edge_curies)} successfully converted, {len(edge_mapping_failures)} failures.') - if node_mapping_failures: + if edge_mapping_failures: self.logger.info(f'Edge conversion failure examples: {edge_mapping_failures[:10]}') def init_curie_converter(self): - biolink_prefix_map = self.get_biolink_prefix_map() + biolink_prefix_map = get_biolink_prefix_map() iri_to_biolink_curie_converter = curies.Converter.from_prefix_map(biolink_prefix_map) iri_to_obo_curie_converter = curies.get_obo_converter() custom_converter = curies.Converter.from_prefix_map(OBO_MISSING_MAPPINGS) @@ -122,28 +111,6 @@ def get_curie_for_node_id(self, node_id): def get_curie_for_edge_id(self, edge_id): return self.edge_curies[edge_id] - @staticmethod - def get_biolink_prefix_map(): 
- # TODO - ideally this would be a specific version of the biolink model, that's not supported by parsers yet - response = requests.get('https://raw.githubusercontent.com/biolink/biolink-model/master/project/prefixmap/biolink_model_prefix_map.json') - if response.status_code != 200: - response.raise_for_status() - - biolink_prefix_map = response.json() - - for duplicate_mapping in BIOLINK_DUPLICATE_MAPPINGS: - if duplicate_mapping in biolink_prefix_map: - del (biolink_prefix_map[duplicate_mapping]) - kegg_keys = [] - for key, value in biolink_prefix_map.items(): - if 'KEGG.' in key: - kegg_keys.append(key) - for key in kegg_keys: - del (biolink_prefix_map[key]) - - biolink_prefix_map.update(BIOLINK_MAPPING_CHANGES) - return biolink_prefix_map - @staticmethod def get_latest_source_version(ubergraph_url: str): sparql_url = f'{ubergraph_url}/sparql' diff --git a/parsers/ViralProteome/src/loadUniRef.py b/parsers/ViralProteome/src/loadUniRef.py index 7bd0d900..efdbbc41 100644 --- a/parsers/ViralProteome/src/loadUniRef.py +++ b/parsers/ViralProteome/src/loadUniRef.py @@ -90,6 +90,7 @@ def get_uniref_data(self) -> set: # are we in test mode if not self.test_mode: # get the list of taxa + #TODO: It looks like gd.get_ncbi_taxon_id_set doesn't resolve. It was removed in https://github.com/RobokopU24/ORION/commit/d3860356f2dac5779d1c15d651e644921dc48f88 target_taxon_set: set = gd.get_ncbi_taxon_id_set(self.data_path, self.TYPE_VIRUS) else: # create a test set of target taxa diff --git a/parsers/ViralProteome/src/loadVP.py b/parsers/ViralProteome/src/loadVP.py index 59ca74d1..8c22170c 100644 --- a/parsers/ViralProteome/src/loadVP.py +++ b/parsers/ViralProteome/src/loadVP.py @@ -25,7 +25,7 @@ class VPLoader(SourceDataLoader): source_data_url = "https://www.ebi.ac.uk/GOA/proteomes" license = "https://www.ebi.ac.uk/about/terms-of-use" attribution = "https://www.ebi.ac.uk/about/terms-of-use" - parsing_version: str = '1.1' + parsing_version: str = '1.2' # organism types TYPE_BACTERIA: str = '0' diff --git a/parsers/_parser_template/src/parser.py b/parsers/_parser_template/src/parser.py index 866adc13..61aec9c4 100644 --- a/parsers/_parser_template/src/parser.py +++ b/parsers/_parser_template/src/parser.py @@ -5,7 +5,7 @@ from Common.extractor import Extractor from Common.loader_interface import SourceDataLoader -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE from Common.prefixes import HGNC # only an example, use existing curie prefixes or add your own to the prefixes file from Common.utils import GetData diff --git a/parsers/camkp/src/loadCAMKP.py b/parsers/camkp/src/loadCAMKP.py index b28c5aa3..c8fb0803 100644 --- a/parsers/camkp/src/loadCAMKP.py +++ b/parsers/camkp/src/loadCAMKP.py @@ -7,7 +7,7 @@ from Common.utils import GetData from Common.kgxmodel import kgxnode, kgxedge -from Common.node_types import ROOT_ENTITY, XREFS +from Common.biolink_constants import XREFS, KNOWLEDGE_LEVEL, KNOWLEDGE_ASSERTION, AGENT_TYPE, MANUAL_AGENT from Common.loader_interface import SourceDataLoader from gzip import GzipFile @@ -31,11 +31,12 @@ class CAMKPLoader(SourceDataLoader): source_id: str = "CAMKP" provenance_id: str = "infores:go-cam" + aggregator_knowledge_source: str = "infores:cam-kp" description = "CAMs (Causal Activity Models) are small knowledge graphs built using the Web Ontology Language (OWL). 
The CAM database combines many CAM graphs along with a large merged bio-ontology containing the full vocabulary of concepts referenced within the individual CAMs. Each CAM describes an instantiation of some of those concepts in a particular context, modeling the interactions between those instances as an interlinked representation of a complex biological or environmental process." source_data_url = "https://github.com/ExposuresProvider/cam-kp-api" license = "https://github.com/ExposuresProvider/cam-kp-api/blob/master/LICENSE" attribution = "https://github.com/ExposuresProvider/cam-kp-api" - parsing_version = "1.1" + parsing_version = "1.3" def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -86,44 +87,36 @@ def parse_data(self) -> dict: line = lines.strip().split('\t') subject_id = self.sanitize_cam_node_id(line[CAMDATACOLS.SUBJECT_ID.value]) - subject_node = kgxnode(subject_id, - name='', - categories=[ROOT_ENTITY], - nodeprops=None) + subject_node = kgxnode(subject_id) self.output_file_writer.write_kgx_node(subject_node) object_id = self.sanitize_cam_node_id(line[CAMDATACOLS.OBJECT_ID.value]) - object_node = kgxnode(object_id, - name='', - categories=[ROOT_ENTITY], - nodeprops=None) - + object_node = kgxnode(object_id) self.output_file_writer.write_kgx_node(object_node) predicate = line[CAMDATACOLS.PREDICATE.value] edge_provenance_id = line[CAMDATACOLS.PROVENANCE_ID.value] edge_provenance_url = line[CAMDATACOLS.PROVENANCE_URL.value] - edge_properties = {} + edge_properties = { + XREFS: [edge_provenance_url], + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT + } if len(line) >= CAMDATACOLS.QUALIFIERS.value + 1: qualifier_json_strings = line[CAMDATACOLS.QUALIFIERS.value].split('||') edge_properties = {k.replace('biolink:', ''): v for json_item in qualifier_json_strings for k, v in json.loads(json_item).items()} - edge_properties[XREFS] = [edge_provenance_url] new_edge = kgxedge(subject_id=subject_id, object_id=object_id, predicate=predicate, primary_knowledge_source=edge_provenance_id, + aggregator_knowledge_sources=[self.aggregator_knowledge_source], edgeprops=edge_properties) self.output_file_writer.write_kgx_edge(new_edge) - record_counter += 1 - self.logger.debug(f'Parsing data file complete.') - load_metadata: dict = { - 'num_source_lines': record_counter, - 'unusable_source_lines': skipped_record_counter - } - + load_metadata: dict = {'num_source_lines': record_counter, + 'unusable_source_lines': skipped_record_counter} return load_metadata def sanitize_cam_node_id(self, node_id): diff --git a/parsers/clinicaltrials/src/loadCTKP.py b/parsers/clinicaltrials/src/loadCTKP.py new file mode 100644 index 00000000..4b95ba1f --- /dev/null +++ b/parsers/clinicaltrials/src/loadCTKP.py @@ -0,0 +1,222 @@ +import enum +import os +import requests +import json + +from Common.biolink_constants import * +from Common.extractor import Extractor +from Common.utils import GetData +from Common.loader_interface import SourceDataLoader +from Common.utils import GetDataPullError + + +# the data header columns the nodes files are: +class NODESDATACOLS(enum.IntEnum): + ID = 0 + NAME = 1 + CATEGORY = 2 + + +# the data header columns for the edges file are: +class EDGESDATACOLS(enum.IntEnum): + ID = 0 + SUBJECT = 1 + PREDICATE = 2 + OBJECT = 3 + SUBJECT_NAME = 4 + OBJECT_NAME = 5 + CATEGORY = 6 + KNOWLEDGE_LEVEL = 7 + AGENT_TYPE = 8 + NCTID = 9 + PHASE = 10 + PRIMARY_PURPOSE = 11 + INTERVENTION_MODEL = 12 + TIME_PERSPECTIVE = 13 + OVERALL_STATUS = 14 + START_DATE = 
15
+    ENROLLMENT = 16
+    ENROLLMENT_TYPE = 17
+    AGE_RANGE = 18
+    CHILD = 19
+    ADULT = 20
+    OLDER_ADULT = 21
+    UNII = 22
+
+
+class CTKPLoader(SourceDataLoader):
+    source_id: str = "ClinicalTrialsKP"
+    provenance_id: str = "infores:biothings-multiomics-clinicaltrials"
+    description = "The Clinical Trials KP, created and maintained by the Multiomics Provider, provides information on Clinical Trials, ultimately derived from researcher submissions to clinicaltrials.gov, via the Aggregate Analysis of Clinical Trials (AACT) database. Information on select trials includes the NCT Identifier of the trial, interventions used, diseases/conditions relevant to the trial, adverse events, etc."
+    source_data_url = "https://aact.ctti-clinicaltrials.org/"
+    license = "https://github.com/ctti-clinicaltrials/aact/blob/dev/LICENSE"
+    attribution = ""
+    parsing_version = "1.0"
+
+    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
+        """
+        :param test_mode - sets the run into test mode
+        :param source_data_dir - the specific storage directory to save files in
+        """
+        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
+
+        # until we can use the manifest to determine versions and source data file locations, we'll hard code it
+        self.node_file_name = 'clinical_trials_kg_nodes_v2.2.10.tsv'
+        self.edge_file_name = 'clinical_trials_kg_edges_v2.2.10.tsv'
+        self.data_url = "https://db.systemsbiology.net/gestalt/KG/"
+
+        # once we use the manifest, we'll rename the files while downloading and they can be called something generic
+        # self.node_file_name = 'nodes.tsv'
+        # self.edge_file_name = 'edges.tsv'
+
+        self.data_files = [
+            self.node_file_name,
+            self.edge_file_name
+        ]
+
+        self.aact_infores = "infores:aact"
+        self.ctgov_infores = "infores:clinicaltrials"
+        self.treats_predicate = "biolink:treats"
+        self.source_record_url = "https://db.systemsbiology.net/gestalt/cgi-pub/KGinfo.pl?id="
+
+    def get_latest_source_version(self) -> str:
+        latest_version = "2.2.10"
+        # we'd like to do this but for now we're using the dev version which is not in the manifest
+        # latest_version = self.get_manifest()['version']
+        return latest_version
+
+    @staticmethod
+    def get_manifest():
+        manifest_response = requests.get('https://github.com/multiomicsKP/clinical_trials_kp/blob/main/manifest.json')
+        if manifest_response.status_code == 200:
+            manifest = manifest_response.json()
+            return manifest
+        else:
+            manifest_response.raise_for_status()
+
+    def get_data(self) -> int:
+        """
+        manifest = self.get_manifest()
+        source_data_urls = manifest['dumper']['data_url']
+        nodes_url = None
+        edges_url = None
+        for data_url in source_data_urls:
+            if 'nodes' in data_url:
+                nodes_url = data_url
+            elif 'edges' in data_url:
+                edges_url = data_url
+        if not nodes_url or not edges_url:
+            raise GetDataPullError(f'Could not determine nodes and edges files in CTKP manifest data urls: {source_data_urls}')
+        data_puller = GetData()
+        data_puller.pull_via_http(nodes_url, self.data_path, saved_file_name=self.node_file_name)
+        data_puller.pull_via_http(edges_url, self.data_path, saved_file_name=self.edge_file_name)
+        """
+        data_puller = GetData()
+        for source in self.data_files:
+            source_url = f"{self.data_url}{source}"
+            data_puller.pull_via_http(source_url, self.data_path)
+        return True
+
+    def parse_data(self) -> dict:
+        """
+        Parses the data file for graph nodes/edges and writes them to the KGX files.
+
+        :return: ret_val: record counts
+        """
+
+        extractor = Extractor(file_writer=self.output_file_writer)
+
+        # get the nodes
+        # it's not really necessary because normalization will overwrite the only information here (name and category)
+        nodes_file: str = os.path.join(self.data_path, self.node_file_name)
+        with open(nodes_file, 'r') as fp:
+            extractor.csv_extract(fp,
+                                  lambda line: line[NODESDATACOLS.ID.value],  # subject id
+                                  lambda line: None,  # object id
+                                  lambda line: None,  # predicate
+                                  lambda line: {NAME: line[NODESDATACOLS.NAME.value],
+                                                NODE_TYPES: line[NODESDATACOLS.CATEGORY.value]},  # subject props
+                                  lambda line: {},  # object props
+                                  lambda line: {},  # edgeprops
+                                  comment_character=None,
+                                  delim='\t',
+                                  has_header_row=True)
+
+        edges_file: str = os.path.join(self.data_path, self.edge_file_name)
+        with open(edges_file, 'r') as fp:
+            extractor.csv_extract(fp,
+                                  lambda line: line[EDGESDATACOLS.SUBJECT.value],  # subject id
+                                  lambda line: line[EDGESDATACOLS.OBJECT.value],  # object id
+                                  lambda line: line[EDGESDATACOLS.PREDICATE.value],  # predicate
+                                  lambda line: {},  # subject props
+                                  lambda line: {},  # object props
+                                  lambda line: self.get_edge_properties(line),  # edgeprops
+                                  comment_character=None,
+                                  delim='\t',
+                                  has_header_row=True)
+
+        return extractor.load_metadata
+
+    def get_edge_properties(self, line):
+
+        supporting_studies = []
+        pred = str(line[EDGESDATACOLS.PREDICATE.value])
+        nctids = str(line[EDGESDATACOLS.NCTID.value]).split(',')
+        phases = str(line[EDGESDATACOLS.PHASE.value]).split(',')
+        status = str(line[EDGESDATACOLS.OVERALL_STATUS.value]).split(',')
+        enroll = str(line[EDGESDATACOLS.ENROLLMENT.value]).split(',')
+        en_typ = str(line[EDGESDATACOLS.ENROLLMENT_TYPE.value]).split(',')
+        max_phase = 0
+        elevate_to_prediction = False
+        for nctid, phase, stat, enrollment, enrollment_type in zip(nctids, phases, status, enroll, en_typ):
+            if float(phase) > max_phase:
+                max_phase = float(phase)
+            try:
+                enrollment = int(enrollment)
+            except ValueError:
+                enrollment = -1
+
+            supporting_study_attributes = {
+                "id": nctid,
+                "tested_intervention": "unsure" if pred == "biolink:mentioned_in_trials_for" else "yes",
+                "phase": phase,
+                "status": stat,
+                "study_size": enrollment
+            }
+            # convert to TRAPI format
+            supporting_studies.append(
+                {"attribute_type_id": HAS_SUPPORTING_STUDY_RESULT,
+                 "value": nctid,
+                 "attributes": [{"attribute_type_id": key,
+                                 "value": value} for key, value in supporting_study_attributes.items()]})
+
+        # if pred == "biolink:in_clinical_trials_for" and max_phase >= 4:
+        #     elevate_to_prediction = True
+
+        if pred == self.treats_predicate:
+            primary_knowledge_source = self.provenance_id
+            aggregator_knowledge_sources = [self.aact_infores]
+            supporting_data_source = self.ctgov_infores
+        else:
+            primary_knowledge_source = self.ctgov_infores
+            aggregator_knowledge_sources = [self.aact_infores, self.provenance_id]
+            supporting_data_source = None
+
+        edge_attributes = {
+            EDGE_ID: line[EDGESDATACOLS.ID.value],
+            PRIMARY_KNOWLEDGE_SOURCE: primary_knowledge_source,
+            AGGREGATOR_KNOWLEDGE_SOURCES: aggregator_knowledge_sources,
+            KNOWLEDGE_LEVEL: line[EDGESDATACOLS.KNOWLEDGE_LEVEL.value],
+            AGENT_TYPE: line[EDGESDATACOLS.AGENT_TYPE.value],
+            MAX_RESEARCH_PHASE: str(float(max_phase)),
+            "elevate_to_prediction": elevate_to_prediction,  # this isn't in biolink so not using a constant for now
+            # note source_record_urls should be paired with specific knowledge sources but currently
+            # there's no implementation for that, just pass it as a normal attribute for now
+            "source_record_urls": [self.source_record_url + line[EDGESDATACOLS.ID.value]]
+        }
+        if supporting_data_source:
+            edge_attributes[SUPPORTING_DATA_SOURCE] = supporting_data_source
+        # to handle nested attributes, use the "attributes" property which supports TRAPI attributes as json strings
+        if supporting_studies:
+            edge_attributes["attributes"] = [json.dumps(study) for study in supporting_studies]
+        return edge_attributes
diff --git a/parsers/cord19/src/loadCord19.py b/parsers/cord19/src/loadCord19.py
index 9c7bf982..b300f63e 100644
--- a/parsers/cord19/src/loadCord19.py
+++ b/parsers/cord19/src/loadCord19.py
@@ -5,7 +5,7 @@ from Common.utils import GetData
 from Common.loader_interface import SourceDataLoader
 from Common.extractor import Extractor
-from Common.node_types import AGGREGATOR_KNOWLEDGE_SOURCES, PRIMARY_KNOWLEDGE_SOURCE
+from Common.biolink_constants import AGGREGATOR_KNOWLEDGE_SOURCES, PRIMARY_KNOWLEDGE_SOURCE
 # the data header columns for both nodes files are:
diff --git a/parsers/drugcentral/src/loaddrugcentral.py b/parsers/drugcentral/src/loaddrugcentral.py
index 428b4220..bd828a85 100644
--- a/parsers/drugcentral/src/loaddrugcentral.py
+++ b/parsers/drugcentral/src/loaddrugcentral.py
@@ -7,7 +7,8 @@ from Common.extractor import Extractor
 from Common.loader_interface import SourceDataLoader, SourceDataFailedError, SourceDataBrokenError
 from Common.utils import GetData, snakify
-from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS
+from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS, \
+    KNOWLEDGE_LEVEL, KNOWLEDGE_ASSERTION, AGENT_TYPE, MANUAL_AGENT
 from Common.prefixes import DRUGCENTRAL, MEDDRA, UMLS, UNIPROTKB, PUBMED
 from Common.predicates import DGIDB_PREDICATE_MAPPING
 from Common.db_connectors import PostgresConnector
@@ -21,7 +22,20 @@ class DrugCentralLoader(SourceDataLoader):
     source_data_url = "https://drugcentral.org/download"
     license = "https://drugcentral.org/privacy"
     attribution = "https://drugcentral.org/about"
-    parsing_version: str = '1.3'
+    parsing_version: str = '1.5'
+
+    omop_relationmap = {'off-label use': 'biolink:applied_to_treat', # applied to treat
+                        'reduce risk': 'biolink:preventative_for_condition', # preventative for condition
+                        'contraindication': 'NCIT:C37933', # contraindication
+                        'symptomatic treatment': 'RO:0002606', # is substance that treats
+                        'indication': 'RO:0002606', # is substance that treats
+                        'diagnosis': 'DrugCentral:5271'} # there's only one row like this.
+
+    act_type_to_knowledge_source_map = {'IUPHAR': 'infores:gtopdb',
+                                        'KEGG DRUG': 'infores:kegg',
+                                        'PDSP': 'infores:pdsp',
+                                        'CHEMBL': 'infores:chembl',
+                                        'DRUGBANK': 'infores:drugbank'}
     def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         """
@@ -30,16 +44,8 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         """
         super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
-        self.omop_relationmap = {'off-label use': 'RO:0002606', # is substance that treats
-                                 'reduce risk': 'RO:0002606', # is substance that treats
-                                 'contraindication': 'NCIT:C37933', # contraindication
-                                 'symptomatic treatment': 'RO:0002606', # is substance that treats
-                                 'indication': 'RO:0002606', # is substance that treats
-                                 'diagnosis': 'RO:0002606', # theres only one row like this.
-        }
-
         self.data_url = 'https://unmtid-shinyapps.net/download/'
-        self.data_file = 'drugcentral.dump.08222022.sql.gz'
+        self.data_file = 'drugcentral.dump.11012023.sql.gz'
         self.adverse_event_predicate = 'biolink:has_adverse_event'
@@ -58,16 +64,19 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
                                     where a.target_id = dc.target_id
                                     and dc.component_id = c.id'''
+        self.unknown_knowledge_sources = set()
+
     def get_latest_source_version(self) -> str:
         """
         gets the version of the data
         :return: the version of the data
         """
-
-        # we could grab this dynamically from here http://juniper.health.unm.edu/tcrd/download/latest.README
-        # but it wouldn't be very helpful until we can automatically populate the DB
-        return '8_22_2022'
+        # There is currently no implementation for automatically loading the postgres database,
+        # so this might as well be hardcoded.
+        #
+        # It's not obvious what the best way to determine it would be anyway: https://unmtid-dbs.net/download/
+        return '11_1_2023'
     def get_data(self):
         gd: GetData = GetData(self.logger.level)
@@ -99,7 +108,9 @@ def parse_data(self):
                               lambda line: self.omop_relationmap[line['relationship_name']],
                               lambda line: {}, # subject props
                               lambda line: {}, # object props
-                              lambda line: {PRIMARY_KNOWLEDGE_SOURCE: DrugCentralLoader.provenance_id} # edge props
+                              lambda line: {PRIMARY_KNOWLEDGE_SOURCE: DrugCentralLoader.provenance_id,
+                                            KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION,
+                                            AGENT_TYPE: MANUAL_AGENT} # edge props
                               )
         # adverse events
@@ -109,9 +120,11 @@ def parse_data(self):
                               lambda line: self.adverse_event_predicate, #It would be better if there were a mapping...
                               lambda line: {}, # subject props
                               lambda line: {}, # object props
-                              lambda line: { 'FAERS_llr': line['llr'],
-                                             AGGREGATOR_KNOWLEDGE_SOURCES: [DrugCentralLoader.provenance_id],
-                                             PRIMARY_KNOWLEDGE_SOURCE: 'infores:faers' } # edge props
+                              lambda line: {'FAERS_llr': line['llr'],
+                                            AGGREGATOR_KNOWLEDGE_SOURCES: [DrugCentralLoader.provenance_id],
+                                            PRIMARY_KNOWLEDGE_SOURCE: 'infores:faers',
+                                            KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION,
+                                            AGENT_TYPE: MANUAL_AGENT} # edge props
                               )
+
         # bioactivity. There are several rows in the main activity table (act_table_full) that include multiple accessions
@@ -123,15 +136,23 @@ def parse_data(self):
                               lambda line: get_bioactivity_predicate(line),
                               lambda line: {}, # subject props
                               lambda line: {}, # object props
-                              lambda line: get_bioactivity_attributes(line) # edge props
+                              lambda line: self.get_bioactivity_attributes(line) # edge props
                               )
+        if self.unknown_knowledge_sources:
+            self.logger.warning(f'Unmapped bioactivity act_type: {self.unknown_knowledge_sources}')
+
+        # retrieve the lists of nodes and edges from the extractor
         self.final_node_list = extractor.nodes
+        self.final_edge_list = extractor.edges
+        # It might seem like the following section doesn't do anything,
+        # but because the Extractor did not have a file writer, nothing has been written yet.
+        # Any nodes in self.final_node_list are written to file after parse_data, so that's how this works.
+ # find node properties for previously extracted nodes node_props_by_id = {} - # here we want all of the information from the structures table except the following columns + # here we want all the information from the structures table, except the following columns: unwanted_properties = ["cd_id", "cas_reg_no", "name", @@ -145,9 +166,10 @@ def parse_data(self): node_props_query = 'select * from structures' db_cursor.execute(node_props_query) rows = db_cursor.fetchall() + extracted_node_ids = extractor.get_node_ids() for row in rows: node_id = f"{DRUGCENTRAL}:{row.pop('id')}" - if node_id in extractor.get_node_ids(): + if node_id in extracted_node_ids: for prop in unwanted_properties: del row[prop] node_props_by_id[node_id] = row @@ -160,6 +182,27 @@ def parse_data(self): return extractor.load_metadata + def get_bioactivity_attributes(self, line): + edge_props = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} + if line['act_type'] is not None: + edge_props['affinity'] = line['act_value'] + edge_props['affinityParameter'] = line['act_type'] + if line['act_source'] == 'SCIENTIFIC LITERATURE' and line['act_source_url'] is not None: + papersource = line['act_source_url'] + if papersource.startswith('http://www.ncbi.nlm.nih.gov/pubmed'): + papersource = f'{PUBMED}:{papersource.split("/")[-1]}' + edge_props[PUBLICATIONS] = [papersource] + + edge_props[PRIMARY_KNOWLEDGE_SOURCE] = self.act_type_to_knowledge_source_map.get(line['act_source'], + self.provenance_id) + if edge_props[PRIMARY_KNOWLEDGE_SOURCE] == self.provenance_id: + if line["act_source"] != 'SCIENTIFIC LITERATURE' and line["act_source"] != 'UNKNOWN': + self.unknown_knowledge_sources.add(line["act_source"]) + else: + edge_props[AGGREGATOR_KNOWLEDGE_SOURCES] = [DrugCentralLoader.provenance_id] + return edge_props + def init_drugcentral_db(self): try: db_host = os.environ['DRUGCENTRAL_DB_HOST'] @@ -167,8 +210,14 @@ def init_drugcentral_db(self): db_password = os.environ['DRUGCENTRAL_DB_PASSWORD'] db_name = os.environ['DRUGCENTRAL_DB_NAME'] db_port = os.environ['DRUGCENTRAL_DB_PORT'] - except KeyError as k: - raise SourceDataFailedError(f'DRUGCENTRAL DB environment variables not set. ({repr(k)})') + except KeyError: + self.logger.warning('DRUGCENTRAL DB environment variables not set. 
Attempting to use public instance..') + db_host = 'unmtid-dbs.net' + db_user = 'drugman' + db_password = 'dosage' + db_name = 'drugcentral' + db_port = 5433 + # raise SourceDataFailedError(f'DRUGCENTRAL DB environment variables not set.') self.drug_central_db = PostgresConnector(db_host=db_host, db_user=db_user, @@ -247,31 +296,8 @@ def get_bioactivity_predicate(line): return predicate -def get_bioactivity_attributes(line): - edge_props = {} - if line['act_type'] is not None: - edge_props['affinity'] = line['act_value'] - edge_props['affinityParameter'] = line['act_type'] - if line['act_source'] == 'SCIENTIFIC LITERATURE' and line['act_source_url'] is not None: - edge_props[PRIMARY_KNOWLEDGE_SOURCE] = DrugCentralLoader.provenance_id - papersource = line['act_source_url'] - if papersource.startswith('http://www.ncbi.nlm.nih.gov/pubmed'): - papersource=f'{PUBMED}:{papersource.split("/")[-1]}' - edge_props[PUBLICATIONS] = [papersource] - else: - edge_props[AGGREGATOR_KNOWLEDGE_SOURCES] = [DrugCentralLoader.provenance_id] - if line['act_source'] == 'IUPHAR': - edge_props[PRIMARY_KNOWLEDGE_SOURCE] = 'infores:gtopdb' - elif line['act_source'] == 'KEGG DRUG': - edge_props[PRIMARY_KNOWLEDGE_SOURCE] = 'infores:kegg' - elif line['act_source'] == 'PDSP': - edge_props[PRIMARY_KNOWLEDGE_SOURCE] = 'infores:pdsp' - elif line['act_source'] == 'CHEMBL': - edge_props[PRIMARY_KNOWLEDGE_SOURCE] = 'infores:chembl' - else: - edge_props[PRIMARY_KNOWLEDGE_SOURCE] = DrugCentralLoader.provenance_id - del edge_props[AGGREGATOR_KNOWLEDGE_SOURCES] - return edge_props + + if __name__ == '__main__': # create a command line parser diff --git a/parsers/drugmechdb/src/drugmechdb_node_map.json b/parsers/drugmechdb/src/drugmechdb_node_map.json new file mode 100644 index 00000000..59887604 --- /dev/null +++ b/parsers/drugmechdb/src/drugmechdb_node_map.json @@ -0,0 +1,858 @@ +{ + "MESH:D016044": { + "id": "PR:000029194", + "name": "BCR/ABL fusion protein (human)" + }, + "interpro:IPR003440": { + "id": "GO:0003843", + "name": "1,3-beta-D-glucan synthase activity" + }, + "MESH:D015514": { + "id": "UMLS:C0029014", + "name": "Oncogene Proteins, Fusion" + }, + "DRUGBANK:DBMET03189": { + "id": "PUBCHEM.COMPOUND:126357", + "name": "5-Aza-2'-deoxycytidine-5'-triphosphate" + }, + "DRUGBANK:DBSALT001065": { + "id": "PUBCHEM.COMPOUND:240767", + "name": "Fluorometholone Acetate" + }, + "DRUGBANK:DBSALT001045": { + "id": "PUBCHEM.COMPOUND:84003", + "name": "Ketorolac tromethamine" + }, + "DRUGBANK:DBMET02573": { + "id": "PUBCHEM.COMPOUND:631051", + "name": "21-Desacetyl Deflazacort" + }, + "DRUGBANK:DBMET01698": { + "id": "PUBCHEM.COMPOUND:68178378", + "name": "6'beta-Hydroxylovastatin" + }, + "UniProtKB:A0A156J405": { + "id": "UniProtKB:A0A4Q2E8H5", + "name": "DNA topoisomerase 4 subunit A" + }, + "UniProtKB:A0A8A6J2U1": { + "id": "UniProtKB:A8GI27", + "name": "Elongation factor 4" + }, + "UniProtKB:S7IMP9": { + "id": "UniProtKB:A0A2D2D9D4", + "name": "Small ribosomal subunit protein uS9" + }, + "UniProtKB:A0A6C1LUF2": { + "id": "UniProtKB:A0A0J8RAV7", + "name": "Cytochrome P450 51" + }, + "UniProtKB:41972": { + "id": "UniProtKB:P41368", + "name": "Isoleucine--tRNA ligase" + }, + "UniProtKB:S7IK33": { + "id": "UniProtKB:A0A2D2D7W9", + "name": "Small ribosomal subunit protein uS4" + }, + "UniProtKB:P4984": { + "id": "UniProtKB:P49841", + "name": "Glycogen synthase kinase-3 beta" + }, + "UniProtKB:S2ZP52": { + "id": "UniProtKB:A0A1F1VU17", + "name": "Small ribosomal subunit protein uS4" + }, + "UniProtKB:A0A443X2G9": { + "id": 
"UniProtKB:W1DID1", + "name": "DNA gyrase subunit A" + }, + "UniProtKB:P3535": { + "id": "UniProtKB:P35354", + "name": "Prostaglandin G/H synthase 2" + }, + "UniProtKB:S3ABF8": { + "id": "UniProtKB:A0A448PPU9", + "name": "Small ribosomal subunit protein uS9" + }, + "UniProtKB:V5AL63": { + "id": "UniProtKB:A0A411IMP7", + "name": "Elongation factor G" + }, + "UniProtKB:S7JCG1": { + "id": "UniProtKB:A0A2D2D8E5", + "name": "Elongation factor G" + }, + "CHEBI:35472": { + "id": "CHEBI:67079", + "name": "anti-inflammatory agent" + }, + "CHEBI:50503": { + "id": "UMLS:C0282090", + "name": "Laxatives" + }, + "CHEBI:21241": { + "id": "CHEBI:176783", + "name": "vitamin C" + }, + "CHEBI:35705": { + "id": "MESH:D007166", + "name": "Immunosuppressive Agents" + }, + "CHEBI:77034": { + "id": "UMLS:C0026698", + "name": "mucolytic agents" + }, + "NCBITaxon:1535326": { + "id": "NCBITaxon:5476", + "name": "Candida albicans" + }, + "NCBITaxon:11103": { + "id": "NCBITaxon:3052230", + "name": "Hepacivirus hominis" + }, + "NCBITaxon:5519": { + "id": "NCBITaxon:55194", + "name": "Malassezia furfur" + }, + "REACT:R-HSA-8932339": { + "id": "REACT:R-HSA-9755511", + "name": "KEAP1-NFE2L2 pathway " + }, + "TIGR:02074": { + "id": "UniProtKB:A0A7U2H7Z3", + "name": "Penicillin-binding protein 1A" + }, + "GO:0003809": { + "id": "GO:0004252", + "name": "serine-type endopeptidase activity" + }, + "GO:0140603": { + "id": "GO:0016887", + "name": "ATP hydrolysis activity" + }, + "GO:0071442": { + "id": "GO:0010484", + "name": "histone H3 acetyltransferase activity" + }, + "GO:0070997": { + "id": "UMLS:C2754100", + "name": "neuron death" + }, + "GO:0000737": { + "id": "GO:0004519", + "name": "endonuclease activity" + }, + "GO:0036475": { + "id": "GO:0036480", + "name": "neuron intrinsic apoptotic signaling pathway in response to oxidative stress" + }, + "GO:0050828": { + "id": "GO:0043129", + "name": "surfactant homeostasis" + }, + "GO:005507": { + "id": "GO:0006879", + "name": "intracellular iron ion homeostasis" + }, + "GO:0071158": { + "id": "GO:0051726", + "name": "regulation of cell cycle" + }, + "GO:0090503": { + "id": "GO:0090305", + "name": "nucleic acid phosphodiester bond hydrolysis" + }, + "GO:0007048": { + "id": "UMLS:C0596263", + "name": "Carcinogenesis" + }, + "GO:0043631": { + "id": "GO:0180011", + "name": "cytosolic mRNA polyadenylation" + }, + "GO:0046855": { + "id": "GO:0043647", + "name": "inositol phosphate metabolic process" + }, + "GO:0044825": { + "id": "GO:0044823", + "name": "retroviral integrase activity" + }, + "GO:0036473": { + "id": "GO:0097468", + "name": "programmed cell death in response to reactive oxygen species" + }, + "GO:0001207": { + "id": "GO:0140713", + "name": "histone chaperone activity" + }, + "GO:0070265": { + "id": "GO:0097300", + "name": "programmed necrotic cell death" + }, + "GO:0055072": { + "id": "GO:0006879", + "name": "intracellular iron ion homeostasis" + }, + "MESH:D010062": { + "id": "UMLS:C0029967", + "name": "Ovulation Induction" + }, + "MESH:D034061": { + "id": "UMLS:C0221145", + "name": "Thrombopoiesis" + }, + "MESH:D002470": { + "id": "UMLS:C0007620", + "name": "Cell survival" + }, + "MESH:D061566": { + "id": "HGNC.FAMILY:184", + "name": "Sodium voltage-gated channels" + }, + "MESH:D000066829": { + "id": "UMLS:C0598958", + "name": "Neuroprotection" + }, + "MESH:D015221": { + "id": "HGNC.FAMILY:183", + "name": "Potassium channels" + }, + "MESH:D019065": { + "id": "UMLS:C0282629", + "name": "Virus Assembly" + }, + "MESH:D014779": { + "id": "UMLS:C0042774", + "name": "Virus 
Replication" + }, + "MESH:D008560": { + "id": "UMLS:C0025245", + "name": "Membrane Fluidity" + }, + "MESH:D014661": { + "id": "GO:0042310", + "name": "vasoconstriction" + }, + "MESH:D014590": { + "id": "UMLS:C0042130", + "name": "Uterine Contraction" + }, + "MESH:D020746": { + "id": "UMLS:C0288263", + "name": "L-Type Calcium Channels" + }, + "MESH:D018047": { + "id": "UMLS:C0001471", + "name": "Receptors, Purinergic P1" + }, + "MESH:D015227": { + "id": "UMLS:C0023775", + "name": "Lipid Peroxidation" + }, + "MESH:D054327": { + "id": "UMLS:C3853691", + "name": "Eye lubricant" + }, + "MESH:D002463": { + "id": "UMLS:C0007605", + "name": "Cell Membrane Permeability" + }, + "MESH:D018994": { + "id": "UMLS:C0027108", + "name": "Myosin Light Chains" + }, + "MESH:D015220": { + "id": "HGNC.FAMILY:182", + "name": "Calcium channels" + }, + "MESH:D016923": { + "id": "UMLS:C0007587", + "name": "Cell Death" + }, + "MESH:D012032": { + "id": "UMLS:C1720801", + "name": "Refractory Period, Electrophysiological" + }, + "MESH:D004167": { + "id": "UMLS:C0012550", + "name": "Diphtheria Toxin" + }, + "MESH:D036022": { + "id": "UMLS:C1136102", + "name": "Capsid Proteins" + }, + "MESH:D016147": { + "id": "UMLS:C0079427", + "name": "Tumor Suppressor Genes" + }, + "MESH:D010539": { + "id": "UMLS:C0031164", + "name": "Permeability" + }, + "MESH:D018168": { + "id": "HGNC.FAMILY:2069", + "name": "Retinoic acid receptors" + }, + "MESH:D018341": { + "id": "UMLS:C0001639", + "name": "Receptors, Adrenergic, alpha-2" + }, + "MESH:D012738": { + "id": "UMLS:C0036883", + "name": "Sex Hormone-Binding Globulin" + }, + "MESH:D055504": { + "id": "UMLS:C2350409", + "name": "Insulin Receptor Substrate Proteins" + }, + "MESH:D015513": { + "id": "UMLS:C0029005", + "name": "Oncogene Proteins" + }, + "MESH:D001610": { + "id": "UMLS:C2607955", + "name": "Beta radiation" + }, + "MESH:D018079": { + "id": "UMLS:C0206518", + "name": "GABA Receptor" + }, + "MESH:D010410": { + "id": "UMLS:C0030847", + "name": "Penile Erection" + }, + "MESH:D051098": { + "id": "UMLS:C1579754", + "name": "Glial Cell Line-Derived Neurotrophic Factors" + }, + "MESH:D017319": { + "id": "UMLS:C0162713", + "name": "Photosensitizing Agents" + }, + "MESH:D057705": { + "id": "UMLS:C2936222", + "name": "Transendothelial and Transepithelial Migration" + }, + "MESH:D016212": { + "id": "UMLS:C0040690", + "name": "Transforming Growth Factor beta" + }, + "MESH:D002199": { + "id": "UMLS:C0006906", + "name": "Capillary Permeability" + }, + "MESH:D053536": { + "id": "NCBIGene:707236", + "name": "KRT16" + }, + "MESH:D000071080": { + "id": "UMLS:C4277734", + "name": "Cortical Excitability" + }, + "MESH:D016084": { + "id": "HP:4000007", + "name": "Bronchoconstriction" + }, + "MESH:D000803": { + "id": "PUBCHEM.COMPOUND:3081372", + "name": "Angiotensin I" + }, + "MESH:D054368": { + "id": "UMLS:C0282090", + "name": "Laxatives" + }, + "MESH:D059365": { + "id": "GO:0000184", + "name": "nuclear-transcribed mRNA catabolic process, nonsense-mediated decay" + }, + "MESH:D011950": { + "id": "HGNC.FAMILY:175", + "name": "Cholinergic receptors" + }, + "MESH:D014655": { + "id": "UMLS:C0042380", + "name": "Vascular resistance" + }, + "MESH:D054856": { + "id": "UMLS:C1517336", + "name": "G-Quadruplexes" + }, + "MESH:D053553": { + "id": "UMLS:C1676737", + "name": "Keratin-6" + }, + "MESH:D017475": { + "id": "UMLS:C0132173", + "name": "Receptors, Nerve Growth Factor" + }, + "interpro:IPR013673": { + "id": "HGNC.FAMILY:276", + "name": "Potassium inwardly rectifying channel subfamily J" + }, + 
"interpro:IPR017790": { + "id": "UMLS:C0135892", + "name": "Penicillin-Binding Protein 2" + }, + "interpro:IPR000451": { + "id": "PANTHER.FAMILY:PTHR24169", + "name": "NUCLEAR FACTOR NF-KAPPA-B PROTEIN" + }, + "interpro:IPR000217": { + "id": "PANTHER.FAMILY:PTHR11588", + "name": "Tubulin" + }, + "interpro:IPR001054": { + "id": "HGNC.FAMILY:819", + "name": "Guanylate cyclases" + }, + "interpro:IPR019602": { + "id": "PANTHER.FAMILY:PTHR10367", + "name": "MRNA-CAPPING ENZYME" + }, + "interpro:IPR002289": { + "id": "UniProtKB:P28472", + "name": "Gamma-aminobutyric acid receptor subunit beta-3" + }, + "interpro:IPR028309": { + "id": "PANTHER.FAMILY:PTHR13742", + "name": "RETINOBLASTOMA-ASSOCIATED PROTEIN RB -RELATED" + }, + "interpro:IPR030826": { + "id": "MESH:D054680", + "name": "Ribosome Subunits, Small, Bacterial" + }, + "interpro:IPR001104": { + "id": "PANTHER.FAMILY:PTHR10556", + "name": "3-OXO-5-ALPHA-STEROID 4-DEHYDROGENASE" + }, + "interpro:IPR006390": { + "id": "UMLS:C0012314", + "name": "Dihydropteroate Synthase" + }, + "interpro:IPR003965": { + "id": "PANTHER.FAMILY:PTHR43775", + "name": "FATTY ACID SYNTHASE" + }, + "interpro:IPR005982": { + "id": "PANTHER.FAMILY:PTHR48105", + "name": "THIOREDOXIN REDUCTASE 1-RELATED-RELATED" + }, + "interpro:IPR005446": { + "id": "UniProtKB:Q13936", + "name": "Voltage-dependent L-type calcium channel subunit alpha-1C" + }, + "interpro:IPR005990": { + "id": "PANTHER.FAMILY:PTHR11911", + "name": "INOSINE-5-MONOPHOSPHATE DEHYDROGENASE RELATED" + }, + "interpro:IPR000499": { + "id": "HGNC.FAMILY:225", + "name": "Endothelin receptors" + }, + "interpro:IPR043502": { + "id": "PANTHER.FAMILY:PTHR10133", + "name": "DNA POLYMERASE I" + }, + "interpro:IPR001128": { + "id": "HGNC.FAMILY:862", + "name": "Cytochrome P450s" + }, + "interpro:IPR000114": { + "id": "PANTHER.FAMILY:PTHR12220", + "name": "50S/60S RIBOSOMAL PROTEIN L16" + }, + "interpro:IPR005742": { + "id": "PANTHER.FAMILY:PTHR11390", + "name": "PROKARYOTIC DNA TOPOISOMERASE" + }, + "interpro:IPR023088": { + "id": "UniProtKB:Q07343", + "name": "3',5'-cyclic-AMP phosphodiesterase 4B" + }, + "interpro:IPR001211": { + "id": "PANTHER.FAMILY:PTHR11716", + "name": "PHOSPHOLIPASE A2 FAMILY MEMBER" + }, + "interpro:IPR037532": { + "id": "UMLS:C0135893", + "name": "Penicillin-Binding Protein 3" + }, + "interpro:IPR001460": { + "id": "PANTHER.FAMILY:PTHR22935", + "name": "PENICILLIN-BINDING PROTEIN" + }, + "interpro:IPR011009": { + "id": "PANTHER.FAMILY:PTHR46448", + "name": "PROTEIN KINASE DOMAIN-CONTAINING PROTEIN" + }, + "interpro:IPR000837": { + "id": "HGNC.FAMILY:1258", + "name": "AP-1 transcription factor" + }, + "interpro:IPR013680": { + "id": "PANTHER.FAMILY:PTHR10166", + "name": "VOLTAGE-DEPENDENT CALCIUM CHANNEL SUBUNIT ALPHA-2/DELTA-RELATED" + }, + "interpro:IPR043371": { + "id": "PANTHER.FAMILY:PTHR11347", + "name": "CYCLIC NUCLEOTIDE PHOSPHODIESTERASE" + }, + "interpro:IPR000476": { + "id": "UMLS:C0018061", + "name": "Gonadotropins" + }, + "interpro:IPR000929": { + "id": "HGNC.FAMILY:181", + "name": "Dopamine receptors" + }, + "interpro:IPR022801": { + "id": "PR:000023862", + "name": "30S ribosomal protein S9" + }, + "interpro:IPR005445": { + "id": "HGNC.FAMILY:1512", + "name": "Calcium voltage-gated channel alpha1 subunits" + }, + "interpro:IPR003574": { + "id": "UniProtKB:P05231", + "name": "Interleukin-6" + }, + "interpro:IPR000003": { + "id": "HGNC.FAMILY:2086", + "name": "Retinoid X receptors" + }, + "interpro:IPR002955": { + "id": "UniProtKB:P10636", + "name": "Microtubule-associated protein 
tau" + }, + "interpro:IPR016248": { + "id": "PANTHER.FAMILY:PTHR19890", + "name": "FIBROBLAST GROWTH FACTOR RECEPTOR" + }, + "interpro:IPR017320": { + "id": "HGNC.FAMILY:989", + "name": "Histone deacetylase" + }, + "interpro:IPR023174": { + "id": "PANTHER.FAMILY:PTHR11347", + "name": "CYCLIC NUCLEOTIDE PHOSPHODIESTERASE" + }, + "interpro:IPR002205": { + "id": "PANTHER.FAMILY:PTHR11390", + "name": "PROKARYOTIC DNA TOPOISOMERASE" + }, + "interpro:IPR000477": { + "id": "UniProtKB:Q72547", + "name": "Reverse transcriptase/RNaseH" + }, + "interpro:IPR013760": { + "id": "PANTHER.FAMILY:PTHR11390", + "name": "PROKARYOTIC DNA TOPOISOMERASE" + }, + "interpro:PR001696": { + "id": "HGNC.FAMILY:1203", + "name": "Sodium voltage-gated channel alpha subunits" + }, + "interpro:IPR026899": { + "id": "UMLS:C5573839", + "name": "1,3-beta-glucan synthase component FKS1" + }, + "interpro:IPR031649": { + "id": "UMLS:C0288263", + "name": "L-Type Calcium Channels" + }, + "interpro:IPR035897": { + "id": "HGNC.FAMILY:1296", + "name": "TIR domain containing" + }, + "interpro:IPR002453": { + "id": "UniProtKB:Q13509", + "name": "Tubulin beta-3 chain" + }, + "interpro:IPR002117": { + "id": "UniProtKB:P04637", + "name": "Cellular tumor antigen p53" + }, + "interpro:IPR017950": { + "id": "UMLS:C0041945", + "name": "Urease" + }, + "interpro:IPR003938": { + "id": "HGNC.FAMILY:274", + "name": "Potassium voltage-gated channels" + }, + "interpro:IPR002227": { + "id": "UniProtKB:P14679", + "name": "Tyrosinase" + }, + "interpro:IPR002233": { + "id": "HGNC.FAMILY:169", + "name": "Adrenoceptors" + }, + "interpro:IPR000235": { + "id": "PR:000023860", + "name": "30S ribosomal protein S7" + }, + "interpro:IPR000584": { + "id": "UniProtKB:Q08289", + "name": "Voltage-dependent L-type calcium channel subunit beta-2" + }, + "interpro:IPR001418": { + "id": "HGNC.FAMILY:245", + "name": "Opioid receptors" + }, + "interpro:IPR001148": { + "id": "HGNC.FAMILY:460", + "name": "Carbonic anhydrases" + }, + "interpro:IPR001321": { + "id": "UniProtKB:Q16665", + "name": "Hypoxia-inducible factor 1-alpha" + }, + "interpro:IPR001254": { + "id": "UMLS:C0758336", + "name": "trypsin-like serine protease" + }, + "interpro:IPR001697": { + "id": "PANTHER.FAMILY:PTHR11817", + "name": "PYRUVATE KINASE" + }, + "interpro:IPR028325": { + "id": "HGNC.FAMILY:274", + "name": "Potassium voltage-gated channels" + }, + "interpro:IPR001873": { + "id": "PANTHER.FAMILY:PTHR11690", + "name": "AMILORIDE-SENSITIVE SODIUM CHANNEL-RELATED" + }, + "interpro:IPR015680": { + "id": "UMLS:C0756632", + "name": "glutamate-gated chloride channel" + }, + "interpro:IPR013759": { + "id": "PANTHER.FAMILY:PTHR11390", + "name": "PROKARYOTIC DNA TOPOISOMERASE" + }, + "interpro:IPR003084": { + "id": "HGNC.FAMILY:989", + "name": "Histone deacetylase" + }, + "interpro:IPR001245": { + "id": "PANTHER.FAMILY:PTHR46392", + "name": "DUAL SERINE/THREONINE AND TYROSINE PROTEIN KINASE" + }, + "interpro:IPR028809": { + "id": "HGNC.FAMILY:184", + "name": "Sodium voltage-gated channels" + }, + "interpro:IPR001170": { + "id": "HGNC.FAMILY:819", + "name": "Guanylate cyclases" + }, + "interpro:IPR006782": { + "id": "PANTHER.FAMILY:PTHR11633", + "name": "PLATELET-DERIVED GROWTH FACTOR" + }, + "interpro:IPR030848": { + "id": "PANTHER.FAMILY:PTHR43261", + "name": "TRANSLATION ELONGATION FACTOR G-RELATED" + }, + "interpro:IPR044109": { + "id": "GO:0004691", + "name": "cAMP-dependent protein kinase activity" + }, + "interpro:IPR005884": { + "id": "PR:000022686", + "name": "fumarate reductase flavoprotein 
subunit" + }, + "interpro:IPR000975": { + "id": "PANTHER.FAMILY:PTHR10078", + "name": "INTERLEUKIN-1 FAMILY MEMBER" + }, + "interpro:IPR000265": { + "id": "UniProtKB:P43115", + "name": "Prostaglandin E2 receptor EP3 subtype" + }, + "interpro:IPR002587": { + "id": "UniProtKB:Q9NPH2", + "name": "Inositol-3-phosphate synthase 1" + }, + "interpro:IPR001464": { + "id": "HGNC.FAMILY:404", + "name": "Annexins" + }, + "interpro:IPR009135": { + "id": "UniProtKB:P17948", + "name": "Vascular endothelial growth factor receptor 1" + }, + "interpro:IPR040125": { + "id": "PANTHER.FAMILY:PTHR10835", + "name": "SQUALENE MONOOXYGENASE" + }, + "interpro:IPR030672": { + "id": "HGNC.FAMILY:53", + "name": "Adenylate cyclases" + }, + "interpro:IPR034162": { + "id": "HGNC.FAMILY:1927", + "name": "Pepsinogens" + }, + "interpro:cd15058": { + "id": "UniProtKB:P08588", + "name": "Beta-1 adrenergic receptor" + }, + "interpro:IPR001320": { + "id": "PANTHER.FAMILY:PTHR18966", + "name": "IONOTROPIC GLUTAMATE RECEPTOR" + }, + "interpro:IPR005311": { + "id": "PANTHER.FAMILY:PTHR22935", + "name": "PANTHER.FAMILY:PTHR22935" + }, + "interpro:IPR001696": { + "id": "HGNC.FAMILY:1203", + "name": "Sodium voltage-gated channel alpha subunits" + }, + "interpro:SSF50353": { + "id": "UMLS:C0079189", + "name": "cytokine" + }, + "interpro:IPR002231": { + "id": "HGNC.FAMILY:171", + "name": "5-hydroxytryptamine receptors" + }, + "interpro:IPR005025": { + "id": "PANTHER.FAMILY:PTHR43355", + "name": "FLAVIN REDUCTASE (NADPH)" + }, + "interpro:IPR002441": { + "id": "UniProtKB:P14672", + "name": "Solute carrier family 2, facilitated glucose transporter member 4" + }, + "interpro:IPR001796": { + "id": "PANTHER.FAMILY:PTHR48069", + "name": "DIHYDROFOLATE REDUCTASE" + }, + "interpro:IPR047096": { + "id": "UniProtKB:P19838", + "name": "Nuclear factor NF-kappa-B p105 subunit" + }, + "interpro:IPR015476": { + "id": "UniProtKB:P06881", + "name": "Calcitonin Gene-Related Peptide" + }, + "interpro:IPR010526": { + "id": "HGNC.FAMILY:184", + "name": "Sodium voltage-gated channels" + }, + "interpro:IPR001634": { + "id": "HGNC.FAMILY:211", + "name": "Adenosine receptors" + }, + "interpro:IPR035516": { + "id": "PANTHER.FAMILY:PTHR11390", + "name": "PROKARYOTIC DNA TOPOISOMERASE" + }, + "interpro:IPR003461": { + "id": "HGNC.FAMILY:1109", + "name": "Keratins" + }, + "interpro:IPR002403": { + "id": "PANTHER.FAMILY:PTHR24286:SF191", + "name": "LANOSTEROL 14-ALPHA DEMETHYLASE" + }, + "interpro:IPR012338": { + "id": "PANTHER.FAMILY:PTHR11935", + "name": "BETA LACTAMASE DOMAIN" + }, + "interpro:IPR002394": { + "id": "HGNC.FAMILY:173", + "name": "Cholinergic receptors nicotinic subunits" + }, + "interpro:IPR003545": { + "id": "UniProtKB:O14746", + "name": "Telomerase reverse transcriptase" + }, + "interpro:IPR000995": { + "id": "HGNC.FAMILY:180", + "name": "holinergic receptors muscarinic" + }, + "interpro:IPR022423": { + "id": "PANTHER.FAMILY:PTHR46987", + "name": "NEUROHYPOPHYSIAL HORMONES, N-TERMINAL DOMAIN CONTAINING PROTEIN" + }, + "interpro:IPR023031": { + "id": "UniProtKB:P11172", + "name": "Uridine 5'-monophosphate synthase" + }, + "interpro:IPR006028": { + "id": "HGNC.FAMILY:563", + "name": "Gamma-aminobutyric acid type A receptor subunits" + }, + "PFAM:PF00809": { + "id": "PR:000022682", + "name": "dihydropteroate synthase" + }, + "PFAM:PF00446": { + "id": "PANTHER.FAMILY:PTHR10522", + "name": "GONADOLIBERIN" + }, + "PFAM:PF11590": { + "id": "PANTHER.FAMILY:PTHR10322", + "name": "DNA POLYMERASE CATALYTIC SUBUNIT" + }, + "PFAM:PF00466": { + "id": 
"PR:000023816", + "name": "50S ribosomal protein L10" + }, + "PFAM:PF00858": { + "id": "PANTHER.FAMILY:PTHR11690", + "name": "AMILORIDE-SENSITIVE SODIUM CHANNEL-RELATED" + }, + "PFAM:PF00186": { + "id": "PANTHER.FAMILY:PTHR48069", + "name": "DIHYDROFOLATE REDUCTASE" + }, + "PFAM:PF00067": { + "id": "HGNC.FAMILY:862", + "name": "Cytochrome P450s" + }, + "PFAM:PF01007": { + "id": "HGNC.FAMILY:276", + "name": "Potassium inwardly rectifying channel subfamily J" + }, + "PFAM:PF00237": { + "id": "PR:000023816", + "name": "50S ribosomal protein L10" + }, + "PFAM:PF02898": { + "id": "PANTHER.FAMILY:PTHR19384", + "name": "NITRIC OXIDE SYNTHASE-RELATED" + }, + "PFAM:PF03520": { + "id": "HGNC.FAMILY:274", + "name": "Potassium voltage-gated channels" + }, + "PFAM:PF00905": { + "id": "PANTHER.FAMILY:PTHR22935", + "name": "PENICILLIN-BINDING PROTEIN" + }, + "PFAM:PF06753": { + "id": "PUBCHEM.COMPOUND:439201", + "name": "Bradykinin" + }, + "PFAM:PF06589": { + "id": "UniProtKB:P04926", + "name": "Malaria protein EXP-1" + }, + "PFAM:PF00449": { + "id": "UMLS:C0041945", + "name": "Urease" + }, + "PFAM:PF06512": { + "id": "HGNC.FAMILY:184", + "name": "Sodium voltage-gated channels" + } +} \ No newline at end of file diff --git a/parsers/drugmechdb/src/drugmechdb_predicate_map.json b/parsers/drugmechdb/src/drugmechdb_predicate_map.json new file mode 100644 index 00000000..76b060c8 --- /dev/null +++ b/parsers/drugmechdb/src/drugmechdb_predicate_map.json @@ -0,0 +1,192 @@ +{ + "biolink:affects_risk_for": { + "inverted": false, + "predicate": "biolink:predisposes_to_condition", + "properties": { + "qualified_predicate": "", + "object_direction_qualifier":"", + "object_aspect_qualifier":"" + } + }, + "biolink:ameliorates": { + "inverted": false, + "predicate": "biolink:ameliorates_condition", + "properties": { + "qualified_predicate": "", + "object_direction_qualifier":"", + "object_aspect_qualifier":"" + } + }, + "biolink:contraindicated_for": { + "inverted": false, + "predicate": "biolink:contraindicated_in", + "properties": { + "qualified_predicate": "", + "object_direction_qualifier":"", + "object_aspect_qualifier":"" + } + }, + "biolink:decreases_abundance_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"decreased", + "object_aspect_qualifier":"abundance" + } + + }, + "biolink:decreases_activity_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"decreased", + "object_aspect_qualifier":"activity" + } + }, + "biolink:decreases_expression_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"decreased", + "object_aspect_qualifier":"expression" + } + }, + "biolink:decreases_synthesis_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"decreased", + "object_aspect_qualifier":"synthesis" + } + }, + "biolink:decreases_uptake_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"decreased", + "object_aspect_qualifier":"uptake" + } + }, + "biolink:directly_interacts_with": { + "inverted": false, + "predicate": "biolink:directly_physically_interacts_with", + "properties": { + "qualified_predicate": "", + 
"object_direction_qualifier":"", + "object_aspect_qualifier":"" + } + }, + "biolink:exacerbates": { + "inverted": false, + "predicate": "biolink:related_to", + "properties": { + "qualified_predicate": "", + "object_direction_qualifier":"", + "object_aspect_qualifier":"" + } + }, + "biolink:increases_abundance_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"abundance" + } + }, + "biolink:increases_activity_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"activity" + } + }, + "biolink:increases_degradation_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"degradation" + } + }, + "biolink:increases_expression_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"expression" + } + }, + "biolink:increases_stability_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"stability" + } + }, + "biolink:increases_transport_of": { + "inverted": false, + "predicate": "biolink:affects", + "properties": { + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"transport" + } + }, + "biolink:molecularly_interacts_with": { + "inverted": false, + "predicate": "biolink:directly_physically_interacts_with", + "properties": { + "qualified_predicate": "", + "object_direction_qualifier":"", + "object_aspect_qualifier":"" + } + }, + "biolink:negatively_regulates": { + "inverted": false, + "predicate": "biolink:regulates", + "properties": { + "qualified_predicate": "", + "object_direction_qualifier":"upregulates", + "object_aspect_qualifier":"" + } + }, + "biolink:positively_regulates": { + "inverted": false, + "predicate": "biolink:regulates", + "properties": { + "qualified_predicate": "", + "object_direction_qualifier":"upregulates", + "object_aspect_qualifier":"" + } + }, + "biolink:predisposes": { + "inverted": false, + "predicate": "biolink:predisposes_to_condition", + "properties": { + "qualified_predicate": "", + "object_direction_qualifier":"", + "object_aspect_qualifier":"" + } + }, + "biolink:prevents": { + "inverted": false, + "predicate": "biolink:preventative_for_condition", + "properties": { + "qualified_predicate": "", + "object_direction_qualifier":"", + "object_aspect_qualifier":"" + } + } +} diff --git a/parsers/drugmechdb/src/loadDrugMechDB.py b/parsers/drugmechdb/src/loadDrugMechDB.py index ad327400..23dac718 100644 --- a/parsers/drugmechdb/src/loadDrugMechDB.py +++ b/parsers/drugmechdb/src/loadDrugMechDB.py @@ -2,24 +2,23 @@ import requests as rq import os import pandas as pd +import ast from Common.utils import GetData from Common.loader_interface import SourceDataLoader +from Common.kgxmodel import kgxnode, kgxedge from Common.extractor import Extractor +from Common.biolink_constants import KNOWLEDGE_LEVEL, KNOWLEDGE_ASSERTION, AGENT_TYPE, MANUAL_AGENT, \ + QUALIFIED_PREDICATE, OBJECT_ASPECT_QUALIFIER, 
OBJECT_DIRECTION_QUALIFIER def load_json(json_data): - with open(json_data, encoding="utf-8") as file: + with open(json_data, encoding="utf-8-sig") as file: data = json.load(file) file.close() return data - -# # Example usage -# json_file = 'indication_paths.json' -# csv_file = 'indication_paths.csv' -# data = load_json(json_file) ############## -# Class: Load in direct Gene/Protein-[biolink:target_for]->Disease relationships from DrugMechDB +# Class: Load in full Clinical Outcome Pathways and direct Gene/Protein-[biolink:target_for]->Disease relationships from DrugMechDB. # By: Jon-Michael Beasley # Date: 09/06/2023 ############## @@ -31,7 +30,7 @@ class DrugMechDBLoader(SourceDataLoader): source_data_url = "https://github.com/SuLab/DrugMechDB/raw/main/indication_paths.json" license = "SuLab/DrugMechDB is licensed under the Creative Commons Zero v1.0 Universal license" attribution = 'https://sulab.github.io/DrugMechDB/' - parsing_version = '1.1' + parsing_version = '1.3' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -40,11 +39,15 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ # call the super super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) - self.drugmechdb_version = '202307' # TODO temporarily hard coded - #self.drugmechdb_version = self.get_latest_source_version() + self.drugmechdb_version = self.get_latest_source_version() self.drugmechdb_data_url = f"https://github.com/SuLab/DrugMechDB/raw/main/" self.drugmechdb_file_name = f"indication_paths.json" self.data_files = [self.drugmechdb_file_name] + self.predicate_mapping: str = 'drugmechdb_predicate_map.json' + self.node_mapping: str = 'drugmechdb_node_map.json' + self.mapping_filepath = os.path.dirname(os.path.abspath(__file__)) + self.predicate_mapping_file = os.path.join(self.mapping_filepath, self.predicate_mapping) + self.node_mapping_file = os.path.join(self.mapping_filepath, self.node_mapping) #TODO Write the function below to get latest update version from https://sulab.github.io/DrugMechDB/ def get_latest_source_version(self) -> str: @@ -52,14 +55,11 @@ def get_latest_source_version(self) -> str: gets the latest version of the data :return: """ - if self.drugmechdb_version: - return self.drugmechdb_version ### The method below gets the database version from the html, but this may be subject to change. 
### - drugmechdb_download_page_response = rq.get('https://www.bindingdb.org/rwd/bind/chemsearch/marvin/Download.jsp') - version_index = drugmechdb_download_page_response.text.index('BindingDB_All_2D_') + 17 - bindingdb_version = drugmechdb_download_page_response.text[version_index:version_index + 6] - - return f"{bindingdb_version}" + drugmechdb_download_page_response = rq.get('https://github.com/SuLab/DrugMechDB') + version_index = drugmechdb_download_page_response.text.index('/SuLab/DrugMechDB/releases/tag/') + 31 + drugmechdb_version = drugmechdb_download_page_response.text[version_index:version_index + 5] + return f"{drugmechdb_version}" def get_data(self) -> int: """ @@ -72,12 +72,30 @@ def get_data(self) -> int: data_puller.pull_via_http(source_url, self.data_path) i+=1 return True + + def fix_node(self,node_id,mapping_dictionary): + fixed_node = node_id.replace('UniProt:', 'UniProtKB:').replace('InterPro:','interpro:').replace('reactome:','REACT:').replace('taxonomy:','NCBITaxon:').replace('Pfam:','PFAM:').replace('DB:','DRUGBANK:').replace('\ufeff','') + if fixed_node in mapping_dictionary.keys(): + fixed_node = mapping_dictionary[fixed_node]["id"] + return fixed_node + def parse_data(self) -> dict: """ Parses the data file for graph nodes/edges :return: ret_val: load_metadata """ + ### This dict stores the edges created for each entry, then it will be grouped and aggregated to merge drugmechdb path ids. + source_target_pair_dict = { + "dmdb_ids":[], + "source_ids":[], + "target_ids":[], + "predicates":[], + "qualified_predicates":[], + "object_direction_qualifiers":[], + "object_aspect_qualifiers":[] + } + ### This dict stores the edges created for the new "biolink:target_for" edges, then it will be grouped and aggregated to merge drugmechdb path ids. 
triple_pair_dict = { "dmdb_ids":[], "drug_names":[], @@ -90,6 +108,14 @@ def parse_data(self) -> dict: } data = load_json(os.path.join(self.data_path,self.drugmechdb_file_name)) + + # init the record counters + record_counter: int = 0 + skipped_record_counter: int = 0 + with open(self.predicate_mapping_file, "r") as pm: + predicate_mapping = json.load(pm) + with open(self.node_mapping_file, "r") as nm: + node_mapping = json.load(nm) for entry in data: dmdb_id = entry["graph"]["_id"] drug_name = entry["graph"]["drug"] @@ -98,21 +124,45 @@ def parse_data(self) -> dict: disease_name = entry["graph"]["disease"] disease_mesh = entry["graph"]["disease_mesh"] links = entry["links"] - for i in range(len(links)): triple = links[i] - if triple["source"] == drug_mesh: - source = triple["source"] - predicate = "biolink:" + triple["key"].replace(" ","_") - target = triple["target"] - nodes = entry["nodes"] + source_target_pair_dict["dmdb_ids"].append(dmdb_id) + + source = triple["source"] + fixed_source = self.fix_node(source,node_mapping) + source_target_pair_dict["source_ids"].append(fixed_source) + output_node = kgxnode(fixed_source) + self.output_file_writer.write_kgx_node(output_node) + + target = triple["target"] + fixed_target = self.fix_node(target,node_mapping) + source_target_pair_dict["target_ids"].append(fixed_target) + output_node = kgxnode(fixed_target) + self.output_file_writer.write_kgx_node(output_node) + + predicate = "biolink:" + triple["key"].replace(" ","_") + if predicate in predicate_mapping.keys(): + source_target_pair_dict["qualified_predicates"].append(predicate_mapping[predicate]["properties"]["qualified_predicate"]) + source_target_pair_dict["object_direction_qualifiers"].append(predicate_mapping[predicate]["properties"]["object_direction_qualifier"]) + source_target_pair_dict["object_aspect_qualifiers"].append(predicate_mapping[predicate]["properties"]["object_aspect_qualifier"]) + predicate = predicate_mapping[predicate]["predicate"] + else: + source_target_pair_dict["qualified_predicates"].append("") + source_target_pair_dict["object_direction_qualifiers"].append("") + source_target_pair_dict["object_aspect_qualifiers"].append("") + + source_target_pair_dict["predicates"].append(predicate) + + ### The next section finds the drug target for assigning "biolink:target_for" edges. + nodes = entry["nodes"] + if source == drug_mesh: for node in nodes: - if (node["id"] == target) and (node["label"] == "Protein"): + if (node["id"] == target) and (node["label"] in ["Protein","GeneFamily"]): drug_target_name = node["name"] - drug_target_uniprot = node["id"].replace('UniProt:', 'UniProtKB:') - + drug_target_uniprot = self.fix_node(node["id"],node_mapping) + disease_mesh = self.fix_node(disease_mesh,node_mapping) triple_pair_dict["dmdb_ids"].append(dmdb_id) triple_pair_dict["drug_names"].append(drug_name) triple_pair_dict["drug_meshs"].append(drug_mesh) @@ -121,15 +171,16 @@ def parse_data(self) -> dict: triple_pair_dict["drug_target_uniprots"].append(drug_target_uniprot) triple_pair_dict["disease_names"].append(disease_name) triple_pair_dict["disease_meshs"].append(disease_mesh) - + + ### If the next node after the drug is a metabolite of the drug, then go forward one link and check if the next node is the target. 
elif node["id"] == target and node["label"] in ["Drug","ChemicalSubstance"]: if entry["links"][i+1]["source"] == node["id"]: new_target = entry["links"][i+1]["target"] for node in nodes: - if (node["id"] == new_target) and (node["label"] == "Protein"): + if (node["id"] == new_target) and (node["label"] in ["Protein","GeneFamily"]): drug_target_name = node["name"] - drug_target_uniprot = node["id"].replace('UniProt:', 'UniProtKB:') - + drug_target_uniprot = self.fix_node(node["id"],node_mapping) + disease_mesh = self.fix_node(disease_mesh,node_mapping) triple_pair_dict["dmdb_ids"].append(dmdb_id) triple_pair_dict["drug_names"].append(drug_name) triple_pair_dict["drug_meshs"].append(drug_mesh) @@ -140,27 +191,47 @@ def parse_data(self) -> dict: triple_pair_dict["disease_meshs"].append(disease_mesh) else: continue - # print(len(triple_pair_dict["dmdb_ids"])) - # print(len(triple_pair_dict["drug_meshs"])) - # print(len(triple_pair_dict["drug_drugbanks"])) - # print(len(triple_pair_dict["drug_target_names"])) - # print(len(triple_pair_dict["drug_target_uniprots"])) - # print(len(triple_pair_dict["disease_meshs"])) + + df = pd.DataFrame(source_target_pair_dict) + df = df.groupby(["source_ids","target_ids","predicates","qualified_predicates","object_direction_qualifiers","object_aspect_qualifiers"], as_index=False).agg(list).reset_index(drop=True) + df['dmdb_ids'] = df['dmdb_ids'].apply(lambda x: list(set(x))) ###Removes duplicates + for index, row in df.iterrows(): + edge_props = {"drugmechdb_path_id": row["dmdb_ids"], + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} + if row["qualified_predicates"] != "": + edge_props[QUALIFIED_PREDICATE] = row["qualified_predicates"] + if row["object_direction_qualifiers"] != "": + edge_props[OBJECT_DIRECTION_QUALIFIER] = row["object_direction_qualifiers"] + if row["object_aspect_qualifiers"] != "": + edge_props[OBJECT_ASPECT_QUALIFIER] = row["object_aspect_qualifiers"] + output_edge = kgxedge( + subject_id=row["source_ids"], + object_id=row["target_ids"], + predicate=row["predicates"], + edgeprops=edge_props, + primary_knowledge_source=self.provenance_id + ) + self.output_file_writer.write_kgx_edge(output_edge) + + ### Saves the "biolink:target_for" edges as a CSV file, which is useful as a benchmarking dataset. 
df = pd.DataFrame(triple_pair_dict) - print(len(df)) + df= df.groupby(["drug_names","drug_meshs","drug_drugbanks","drug_target_names","drug_target_uniprots","disease_names","disease_meshs"], as_index=False).agg(list).reset_index(drop=True) + df['dmdb_ids'] = df['dmdb_ids'].apply(lambda x: list(set(x))) ###Removes duplicates csv_file_name = os.path.join(self.data_path,"indication_paths.csv") df.to_csv(csv_file_name) - #TODO Figure out how to parse the triple store as a dictionary extractor = Extractor(file_writer=self.output_file_writer) with open(csv_file_name, 'rt') as fp: extractor.csv_extract(fp, - lambda line: line[6], # subject id - lambda line: line[8], # object id + lambda line: line[5], # subject id + lambda line: line[7], # object id lambda line: "biolink:target_for", lambda line: {}, #Node 1 props lambda line: {}, #Node 2 props - lambda line: {}, #Edge props + lambda line: {"drugmechdb_path_id": ast.literal_eval(line[8]), + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT}, #Edge props comment_character=None, delim=",", has_header_row=True diff --git a/parsers/gtopdb/src/loadGtoPdb.py b/parsers/gtopdb/src/loadGtoPdb.py index f7127e4d..71625048 100644 --- a/parsers/gtopdb/src/loadGtoPdb.py +++ b/parsers/gtopdb/src/loadGtoPdb.py @@ -11,7 +11,7 @@ from Common.prefixes import GTOPDB, HGNC, ENSEMBL, PUBMED from Common.kgxmodel import kgxnode, kgxedge from Common.predicates import DGIDB_PREDICATE_MAPPING -from Common.node_types import PUBLICATIONS, AFFINITY, AFFINITY_PARAMETER +from Common.biolink_constants import * class INTERACTIONS_COLS(enum.Enum): @@ -57,7 +57,7 @@ class GtoPdbLoader(SourceDataLoader): source_data_url = "http://www.guidetopharmacology.org/" license = "https://www.guidetopharmacology.org/about.jsp#license" attribution = "https://www.guidetopharmacology.org/citing.jsp" - parsing_version: str = '1.2' + parsing_version: str = '1.4' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -199,10 +199,15 @@ def process_peptides(self, file_path: str) -> (int, int): part_node = kgxnode(part_node_id, name=part_node_name) self.output_file_writer.write_kgx_node(part_node) + edge_props = { + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT + } new_edge = kgxedge(ligand_id, part_node_id, predicate=self.has_part_predicate, - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edgeprops=edge_props) self.output_file_writer.write_kgx_edge(new_edge) else: skipped_record_counter += 1 @@ -253,17 +258,19 @@ def process_interactions(self, file_path: str, gene_symbol_to_id_map: dict) -> ( ligand_node = kgxnode(ligand_id, name=ligand_name) self.output_file_writer.write_kgx_node(ligand_node) - props: dict = {'primaryTarget': True if r[INTERACTIONS_COLS.PRIMARY_TARGET.value] == 'true' else False, + edge_props: dict = {'primaryTarget': True if r[INTERACTIONS_COLS.PRIMARY_TARGET.value] == 'true' else False, AFFINITY_PARAMETER: r[INTERACTIONS_COLS.AFFINITY_UNITS.value], - 'endogenous': True if r[INTERACTIONS_COLS.ENDOGENOUS.value] == 'true' else False} + 'endogenous': True if r[INTERACTIONS_COLS.ENDOGENOUS.value] == 'true' else False, + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} # check the affinity median and ensure it is a float if r[INTERACTIONS_COLS.AFFINITY_MEDIAN.value] != '': - props.update({AFFINITY: float(r[INTERACTIONS_COLS.AFFINITY_MEDIAN.value])}) + edge_props[AFFINITY] = float(r[INTERACTIONS_COLS.AFFINITY_MEDIAN.value]) # if there are publications add them in 
if r[INTERACTIONS_COLS.PUBMED_ID.value] != '': - props.update({PUBLICATIONS: [f'{PUBMED}:{x}' for x in r[INTERACTIONS_COLS.PUBMED_ID.value].split('|')]}) + edge_props[PUBLICATIONS] = [f'{PUBMED}:{x}' for x in r[INTERACTIONS_COLS.PUBMED_ID.value].split('|')] genes = r[INTERACTIONS_COLS.TARGET_ENSEMBL_GENE_ID.value].split('|') gene_names = r[INTERACTIONS_COLS.TARGET_GENE_SYMBOLS.value].split('|') @@ -277,7 +284,7 @@ def process_interactions(self, file_path: str, gene_symbol_to_id_map: dict) -> ( gene_id, predicate=predicate, primary_knowledge_source=self.provenance_id, - edgeprops=props) + edgeprops=edge_props) self.output_file_writer.write_kgx_edge(new_edge) if "Human" in r[INTERACTIONS_COLS.LIGAND_SPECIES.value] \ @@ -291,15 +298,16 @@ def process_interactions(self, file_path: str, gene_symbol_to_id_map: dict) -> ( gene_node = kgxnode(gene_id, name=gene_symbol) self.output_file_writer.write_kgx_node(gene_node) - props: dict = {} - if r[INTERACTIONS_COLS.PUBMED_ID.value] != '': - props.update({PUBLICATIONS: [f'{PUBMED}:{x}' for x in r[INTERACTIONS_COLS.PUBMED_ID.value].split('|')]}) + edge_props: dict = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} + if r[INTERACTIONS_COLS.PUBMED_ID.value]: + edge_props[PUBLICATIONS] = [f'{PUBMED}:{x}' for x in r[INTERACTIONS_COLS.PUBMED_ID.value].split('|')] new_edge = kgxedge(gene_id, ligand_id, predicate=self.has_gene_product_predicate, primary_knowledge_source=self.provenance_id, - edgeprops=props) + edgeprops=edge_props) self.output_file_writer.write_kgx_edge(new_edge) else: skipped_record_counter += 1 diff --git a/parsers/hetio/src/loadHetio.py b/parsers/hetio/src/loadHetio.py index c2c1ea4e..e78c4ed7 100644 --- a/parsers/hetio/src/loadHetio.py +++ b/parsers/hetio/src/loadHetio.py @@ -8,7 +8,7 @@ from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor from Common.prefixes import NCBIGENE, DRUGBANK, UBERON, DOID, MESH, UMLS -from Common.node_types import AGGREGATOR_KNOWLEDGE_SOURCES, PRIMARY_KNOWLEDGE_SOURCE +from Common.biolink_constants import * class HetioLoader(SourceDataLoader): @@ -19,7 +19,7 @@ class HetioLoader(SourceDataLoader): source_data_url = "https://github.com/hetio/hetionet/blob/master/hetnet/json/hetionet-v1.0.json.bz2" license = "https://het.io/about/" attribution = "https://het.io/about/" - parsing_version: str = '1.4' + parsing_version: str = '1.5' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -210,7 +210,10 @@ def get_predicate_from_edge(edge, kind_to_abbrev_lookup): def get_edge_properties(edge): - edge_props = {} + edge_props = { + KNOWLEDGE_LEVEL: NOT_PROVIDED, + AGENT_TYPE: NOT_PROVIDED + } edge_data = edge['data'] if 'source' in edge_data: edge_sources = [edge_data['source']] diff --git a/parsers/hgnc/src/loadHGNC.py b/parsers/hgnc/src/loadHGNC.py index 04d72156..9eee0d56 100644 --- a/parsers/hgnc/src/loadHGNC.py +++ b/parsers/hgnc/src/loadHGNC.py @@ -6,6 +6,7 @@ from Common.loader_interface import SourceDataLoader from Common.kgxmodel import kgxnode, kgxedge from Common.prefixes import HGNC, HGNC_FAMILY +from Common.biolink_constants import * ############## @@ -23,7 +24,7 @@ class HGNCLoader(SourceDataLoader): source_data_url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/" license = "https://www.genenames.org/about/" attribution = "https://www.genenames.org/about/" - parsing_version: str = '1.1' + parsing_version: str = '1.2' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -33,7 +34,10 @@ 
def __init__(self, test_mode: bool = False, source_data_dir: str = None): super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) self.complete_set_file_name = 'hgnc_complete_set.txt' - self.data_files: list = [self.complete_set_file_name, 'hgnc_genes_in_groups.txt'] + # self.gene_groups_file_name = 'hgnc_genes_in_groups.txt' + self.data_files: list = [self.complete_set_file_name, + # self.gene_groups_file_name + ] self.test_mode: bool = test_mode self.source_db: str = 'HUGO Gene Nomenclature Committee' @@ -61,24 +65,11 @@ def get_data(self) -> int: """ # get a reference to the data gathering class gd: GetData = GetData(self.logger.level) + file_count: int = gd.pull_via_ftp(self.ftp_site, self.ftp_dir, [self.complete_set_file_name], self.data_path) - # TODO - # if self.test_mode: - # set up test data instead - # else: - # get the complete data set - file_count: int = gd.pull_via_ftp(self.ftp_site, self.ftp_dir, [self.data_files[0]], self.data_path) + # get the gene groups dataset + # byte_count: int = gd.pull_via_http('https://www.genenames.org/cgi-bin/genegroup/download-all/' + self.gene_groups_file_name, self.data_path) - # did we get the file - if file_count > 0: - # get the gene groups dataset - byte_count: int = gd.pull_via_http('https://www.genenames.org/cgi-bin/genegroup/download-all/' + self.data_files[1], self.data_path) - - # did we get the data - if byte_count > 0: - file_count += 1 - - # return the file count to the caller return file_count def parse_data(self) -> dict: @@ -141,11 +132,12 @@ def parse_data(self) -> dict: self.final_node_list.append(gene_family_node) # get the baseline properties - props = {} + props = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} # were there publications if len(r['pubmed_id']) > 0: - props.update({'publications': ['PMID:' + v for v in r['pubmed_id'].split('|')]}) + props[PUBLICATIONS] = ['PMID:' + v for v in r['pubmed_id'].split('|')] # create the gene to gene family edge new_edge = kgxedge(gene_family_curie, diff --git a/parsers/hmdb/src/loadHMDB.py b/parsers/hmdb/src/loadHMDB.py index acfe1042..5134195b 100644 --- a/parsers/hmdb/src/loadHMDB.py +++ b/parsers/hmdb/src/loadHMDB.py @@ -6,7 +6,8 @@ from bs4 import BeautifulSoup from zipfile import ZipFile -from Common.utils import LoggingUtil, GetData +from Common.biolink_constants import * +from Common.utils import GetData from Common.loader_interface import SourceDataLoader from Common.prefixes import CTD, HMDB, OMIM, UNIPROTKB from Common.kgxmodel import kgxnode, kgxedge @@ -27,7 +28,7 @@ class HMDBLoader(SourceDataLoader): source_data_url = "https://hmdb.ca/downloads" license = "https://hmdb.ca/about" attribution = "https://hmdb.ca/about#cite" - parsing_version: str = '1.2' + parsing_version: str = '1.3' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -201,16 +202,14 @@ def get_genes(self, el, metabolite_id) -> bool: # what type of protein is this if protein_type.text.startswith('Enzyme'): # create the edge data - props: dict = {} subject_id: str = metabolite_id object_id: str = protein_id predicate: str = f'{CTD}:affects_abundance_of' # else it must be a transport? 
else: # create the edge data - props: dict = {} - subject_id: str = metabolite_id - object_id: str = protein_id + subject_id: str = protein_id + object_id: str = metabolite_id predicate: str = f'{CTD}:increases_transport_of' # get the name element @@ -226,12 +225,14 @@ def get_genes(self, el, metabolite_id) -> bool: new_node = kgxnode(protein_id, name=name) self.output_file_writer.write_kgx_node(new_node) + edge_props = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} # create an edge and add it to the list new_edge = kgxedge(subject_id, object_id, predicate=predicate, primary_knowledge_source=self.provenance_id, - edgeprops=props) + edgeprops=edge_props) self.output_file_writer.write_kgx_edge(new_edge) else: self.logger.debug(f'no protein type for {metabolite_id}') @@ -312,11 +313,12 @@ def get_diseases(self, el, metabolite_id) -> bool: pmids.append('PMID:' + pmid.text) # create the edge property data - props: dict = {} + edge_props = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} # if we found any pubmed ids add them to the properties (optional) if len(pmids) > 0: - props.update({'publications': pmids}) + edge_props[PUBLICATIONS] = pmids disease_id = f'{OMIM}:{object_id.text}' @@ -329,7 +331,7 @@ def get_diseases(self, el, metabolite_id) -> bool: disease_id, predicate='RO:0002610', primary_knowledge_source=self.provenance_id, - edgeprops=props) + edgeprops=edge_props) self.output_file_writer.write_kgx_edge(new_edge) ret_val = True else: @@ -391,11 +393,13 @@ def get_pathways(self, el, metabolite_id) -> bool: new_node = kgxnode(object_id, name=name) self.output_file_writer.write_kgx_node(new_node) - # create an edge and add it to the list + edge_props = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} new_edge = kgxedge(metabolite_id, object_id, predicate='RO:0000056', - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edgeprops=edge_props) self.output_file_writer.write_kgx_edge(new_edge) else: self.logger.debug(f'invalid smpdb for {metabolite_id}') diff --git a/parsers/molepro/src/loadMolePro.py b/parsers/molepro/src/loadMolePro.py new file mode 100644 index 00000000..a6d32cac --- /dev/null +++ b/parsers/molepro/src/loadMolePro.py @@ -0,0 +1,146 @@ +import os + +from Common.biolink_constants import * +from Common.utils import GetData +from Common.loader_interface import SourceDataLoader + + +""" +NOTE these are in the molepro data file but aren't supported here (relation is deprecated) + +'biolink:relation' +'biolink:update_date' +'attributes' + +NOTE that FDA_approval_status is in the edges file headers but it should be highest_FDA_approval_status + +""" + + +class MoleProLoader(SourceDataLoader): + source_id: str = "MolePro" + provenance_id: str = "infores:molepro" + description = "MolePro!" 
+ source_data_url = "" + license = "" + attribution = "" + parsing_version = "1.0" + + def __init__(self, test_mode: bool = False, source_data_dir: str = None): + """ + :param test_mode - sets the run into test mode + :param source_data_dir - the specific storage directory to save files in + """ + super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) + + self.data_url = ' https://molepro.s3.amazonaws.com/' + self.node_file_name: str = 'nodes.tsv' + self.edge_file_name: str = 'edges.tsv' + + self.data_files = [ + self.node_file_name, + self.edge_file_name + ] + + def get_latest_source_version(self) -> str: + return "1.0" + + def get_data(self) -> int: + data_puller = GetData() + for source in self.data_files: + source_url = f"{self.data_url}{source}" + data_puller.pull_via_http(source_url, self.data_path) + return True + + def parse_data(self) -> dict: + """ + Parses the data file for graph nodes/edges and writes them to the KGX files. + + :return: ret_val: record counts + """ + record_counter = 0 + skipped_record_counter = 0 + skipped_node_counter = 0 + + delimiter = '|' + node_property_indexes = {} + node_file_path: str = os.path.join(self.data_path, self.node_file_name) + with open(node_file_path, 'r') as node_file: + for line in node_file: + node_file_line = line.split('\t') + if not node_property_indexes: + # look at the file header and determine the indexes of the node properties, if they exist + # check for the properties with and without the biolink prefix + for node_property in BIOLINK_NODE_PROPERTIES + [f'biolink:{node_p}' for node_p in BIOLINK_NODE_PROPERTIES]: + try: + node_property_indexes[node_property] = node_file_line.index(node_property) + except ValueError: + pass + node_properties_to_split = [node_property.removeprefix('biolink:') for node_property in node_property_indexes + if node_property.removeprefix('biolink:') in BIOLINK_PROPERTIES_THAT_ARE_LISTS] + else: + # make a dictionary with the biolink properties on that line + next_node = { + node_property.removeprefix('biolink:'): node_file_line[node_property_indexes[node_property]] + for node_property in node_property_indexes if node_file_line[node_property_indexes[node_property]] + } + # check and make sure it has all the required node properties (except name could be empty) + if any(not next_node[node_property] for node_property in + REQUIRED_NODE_PROPERTIES if node_property is not NAME): + skipped_node_counter += 1 + continue + # convert the properties that should be lists to lists and split on a delimiter + for node_property in node_properties_to_split: + if node_property in next_node: + next_node[node_property] = next_node[node_property].split(delimiter) + # write the node to file + self.output_file_writer.write_node(node_id=next_node.pop(NODE_ID), + node_name=next_node.pop(NAME), + node_types=next_node.pop(NODE_TYPES), + node_properties=next_node) + + edge_property_indexes = {} + edge_file_path: str = os.path.join(self.data_path, self.edge_file_name) + with open(edge_file_path, 'r') as edge_file: + for line in edge_file: + edge_file_line = line.split('\t') + if not edge_property_indexes: + # look at the file header and determine the indexes of the edge properties, if they exist + for edge_property in BIOLINK_EDGE_PROPERTIES + [f'biolink:{edge_p}' for edge_p in BIOLINK_EDGE_PROPERTIES]: + try: + edge_property_indexes[edge_property] = edge_file_line.index(edge_property) + except ValueError: + pass + edge_properties_to_split = [edge_property.removeprefix('biolink:') for edge_property in 
edge_property_indexes + if edge_property.removeprefix('biolink:') in BIOLINK_PROPERTIES_THAT_ARE_LISTS] + else: + if self.test_mode and record_counter > 20000: + break + + # make a dictionary with the biolink properties on that line + next_edge = { + edge_property.removeprefix('biolink:'): edge_file_line[edge_property_indexes[edge_property]] + for edge_property in edge_property_indexes if edge_file_line[edge_property_indexes[edge_property]] + } + # check to make sure it has all the required properties + if any(not next_edge[edge_property] for edge_property in REQUIRED_EDGE_PROPERTIES): + skipped_record_counter += 1 + continue + # convert the properties that should be lists to lists and split on a delimiter + for edge_property in edge_properties_to_split: + if edge_property in next_edge: + next_edge[edge_property] = next_edge[edge_property].split(delimiter) + + # make sure there aren't multiple primary knowledge sources + next_edge[PRIMARY_KNOWLEDGE_SOURCE] = next_edge[PRIMARY_KNOWLEDGE_SOURCE].split('|')[0] + + # write the edge to file + self.output_file_writer.write_normalized_edge(next_edge) + record_counter += 1 + + # load up the metadata + load_metadata: dict = { + 'num_source_lines': record_counter, + 'unusable_source_lines': skipped_record_counter, + 'unusable_nodes': skipped_node_counter} + return load_metadata diff --git a/parsers/monarchkg/src/loadMonarchKG.py b/parsers/monarchkg/src/loadMonarchKG.py index 6a985ba1..2417cb2f 100644 --- a/parsers/monarchkg/src/loadMonarchKG.py +++ b/parsers/monarchkg/src/loadMonarchKG.py @@ -5,7 +5,7 @@ from Common.loader_interface import SourceDataLoader from Common.kgxmodel import kgxedge -from Common.node_types import PUBLICATIONS +from Common.biolink_constants import * from Common.utils import GetData @@ -18,7 +18,7 @@ class MonarchKGLoader(SourceDataLoader): source_id: str = 'MonarchKG' provenance_id: str = 'infores:monarchinitiative' - parsing_version: str = '1.0' + parsing_version: str = '1.1' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -29,7 +29,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): # there is a /latest/ for this url, but without a valid get_latest_source_version function, # it could create a mismatch, pin to this version for now - self.data_url = 'https://data.monarchinitiative.org/monarch-kg-dev/2023-11-16/' + self.data_url = 'https://data.monarchinitiative.org/monarch-kg-dev/2024-03-18/' self.monarch_graph_archive = 'monarch-kg.jsonl.tar.gz' self.monarch_edge_file_archive_path = 'monarch-kg_edges.jsonl' self.data_files = [self.monarch_graph_archive] @@ -65,7 +65,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): def get_latest_source_version(self) -> str: # possible to retrieve from /latest/index.html with beautifulsoup or some html parser but not ideal, # planning to try to set up a better method with owners - latest_version = '2023-11-16' + latest_version = '2024-03-18' return latest_version def get_data(self) -> bool: @@ -89,12 +89,13 @@ def parse_data(self) -> dict: with tar_files.extractfile(self.monarch_edge_file_archive_path) as edges_file: for line in edges_file: monarch_edge = orjson.loads(line) - subject_id = monarch_edge['subject'] - object_id = monarch_edge['object'] - predicate = monarch_edge['predicate'] + # normally we wouldn't use constants to read FROM a source, + # but in this case monarch kg is biolink compliant, so they should be the same + subject_id = monarch_edge[SUBJECT_ID] + object_id = 
monarch_edge[OBJECT_ID] + predicate = monarch_edge[PREDICATE] if not (subject_id and object_id and predicate): skipped_bad_record_counter += 1 - print(line) continue if predicate not in self.desired_predicates: @@ -103,26 +104,32 @@ def parse_data(self) -> dict: # get the knowledge sources, map them to something else if needed, # then check if edge should be ignored due to the knowledge source - primary_knowledge_source = self.knowledge_source_mapping.get(monarch_edge['primary_knowledge_source'], - monarch_edge['primary_knowledge_source']) - aggregator_knowledge_sources = [self.knowledge_source_mapping.get(ks, ks) for ks in monarch_edge['aggregator_knowledge_source']] + primary_knowledge_source = self.knowledge_source_mapping.get(monarch_edge[PRIMARY_KNOWLEDGE_SOURCE], + monarch_edge[PRIMARY_KNOWLEDGE_SOURCE]) + aggregator_knowledge_sources = [self.knowledge_source_mapping.get(ks, ks) for ks in monarch_edge[AGGREGATOR_KNOWLEDGE_SOURCES]] if primary_knowledge_source in self.knowledge_source_ignore_list or \ any([ks in self.knowledge_source_ignore_list for ks in aggregator_knowledge_sources]): skipped_ignore_knowledge_source += 1 continue - edge_properties = {} - if monarch_edge['publications']: - edge_properties[PUBLICATIONS] = monarch_edge['publications'] + edge_properties = { + KNOWLEDGE_LEVEL: monarch_edge[KNOWLEDGE_LEVEL] if KNOWLEDGE_LEVEL in monarch_edge else NOT_PROVIDED, + AGENT_TYPE: monarch_edge[AGENT_TYPE] if AGENT_TYPE in monarch_edge else NOT_PROVIDED + } + if monarch_edge[PUBLICATIONS]: + edge_properties[PUBLICATIONS] = monarch_edge[PUBLICATIONS] for edge_attribute in monarch_edge: if '_qualifier' in edge_attribute and monarch_edge[edge_attribute]: edge_properties[edge_attribute] = monarch_edge[edge_attribute] + elif edge_attribute == QUALIFIED_PREDICATE and monarch_edge[QUALIFIED_PREDICATE]: + edge_properties[QUALIFIED_PREDICATE] = monarch_edge[QUALIFIED_PREDICATE] output_edge = kgxedge( subject_id=subject_id, predicate=predicate, object_id=object_id, primary_knowledge_source=primary_knowledge_source, - aggregator_knowledge_sources=aggregator_knowledge_sources + aggregator_knowledge_sources=aggregator_knowledge_sources, + edgeprops=edge_properties ) self.output_file_writer.write_node(object_id) self.output_file_writer.write_node(subject_id) diff --git a/parsers/panther/src/loadPanther.py b/parsers/panther/src/loadPanther.py index e5352184..ec8144f0 100644 --- a/parsers/panther/src/loadPanther.py +++ b/parsers/panther/src/loadPanther.py @@ -6,6 +6,7 @@ import requests from bs4 import BeautifulSoup +from Common.biolink_constants import * from Common.utils import GetData from Common.loader_interface import SourceDataLoader from Common.kgxmodel import kgxnode, kgxedge @@ -38,7 +39,7 @@ class PLoader(SourceDataLoader): source_data_url = "ftp.pantherdb.org/sequence_classifications/" license = "http://pantherdb.org/tou.jsp" attribution = "http://pantherdb.org/publications.jsp#HowToCitePANTHER" - parsing_version: str = '1.1' + parsing_version: str = '1.2' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -114,6 +115,7 @@ def get_data(self) -> int: # do the real thing if we arent in debug mode if not self.test_mode: # get the complete data set + #TODO make these class level variables. 
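For illustration only, a minimal sketch of what that TODO might look like; the class-level attribute names below are hypothetical and not part of this patch:

    # hypothetical class-level settings for PLoader (names are assumptions, not in this diff)
    ftp_site: str = 'ftp.pantherdb.org'
    ftp_dir_template: str = '/sequence_classifications/{version}/PANTHER_Sequence_Classification_files/'

    # get_data() could then build the pull from the object's settings instead of inline literals
    file_count: int = gd.pull_via_ftp(self.ftp_site,
                                      self.ftp_dir_template.format(version=self.data_version),
                                      [self.data_file],
                                      self.data_path)
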
file_count: int = gd.pull_via_ftp('ftp.pantherdb.org', f'/sequence_classifications/{self.data_version}/PANTHER_Sequence_Classification_files/', [self.data_file], self.data_path) else: file_count: int = 1 @@ -261,10 +263,13 @@ def get_gene_family_by_gene_family(self, family): self.final_node_list.append(gene_sub_family_node) # create the edge + edge_properties = {KNOWLEDGE_LEVEL: NOT_PROVIDED, + AGENT_TYPE: NOT_PROVIDED} new_edge = kgxedge(subject_id=g_sub_fam_id, predicate='BFO:0000050', object_id=family.identifier, - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edgeprops=edge_properties) self.final_edge_list.append(new_edge) def get_gene_by_gene_family(self, family): @@ -286,10 +291,13 @@ def get_gene_by_gene_family(self, family): self.final_node_list.append(gene_node) # create the edge + edge_properties = {KNOWLEDGE_LEVEL: NOT_PROVIDED, + AGENT_TYPE: NOT_PROVIDED} gene_family_edge = kgxedge(subject_id=gene_id, predicate='BFO:0000050', object_id=family.identifier, - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edgeprops=edge_properties) self.final_edge_list.append(gene_family_edge) def get_biological_process_or_activity_by_gene_family(self, family): @@ -310,10 +318,13 @@ def get_biological_process_or_activity_by_gene_family(self, family): self.final_node_list.append(new_node) # create the gene_family-biological_process_or_activity edge + edge_properties = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} new_edge = kgxedge(subject_id=family.identifier, predicate='RO:0002331', object_id=bio_p_id, - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edgeprops=edge_properties) self.final_edge_list.append(new_edge) def get_molecular_function_by_gene_family(self, family): @@ -334,10 +345,13 @@ def get_molecular_function_by_gene_family(self, family): self.final_node_list.append(new_node) # create the gene_family-molecular function edge + edge_properties = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT} new_edge = kgxedge(subject_id=family.identifier, predicate='RO:0002327', object_id=mole_func_id, - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edgeprops=edge_properties) self.final_edge_list.append(new_edge) def get_cellular_component_by_gene_family(self, family): @@ -358,10 +372,13 @@ def get_cellular_component_by_gene_family(self, family): self.final_node_list.append(new_node) # create the gene_family-cellular_component edge + edge_properties = {KNOWLEDGE_LEVEL: NOT_PROVIDED, + AGENT_TYPE: NOT_PROVIDED} new_edge = kgxedge(subject_id=family.identifier, predicate='RO:0001025', object_id=cellular_component_id, - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edgeprops=edge_properties) self.final_edge_list.append(new_edge) def get_pathway_by_gene_family(self, family): @@ -384,10 +401,13 @@ def get_pathway_by_gene_family(self, family): self.final_node_list.append(new_node) # create the gene_family-pathway edge + edge_properties = {KNOWLEDGE_LEVEL: NOT_PROVIDED, + AGENT_TYPE: NOT_PROVIDED} new_edge = kgxedge(subject_id=panther_pathway_id, predicate='RO:0000057', object_id=family.identifier, - primary_knowledge_source=self.provenance_id) + primary_knowledge_source=self.provenance_id, + edgeprops=edge_properties) self.final_edge_list.append(new_edge) def get_gene_id_from_row(self, row): diff --git a/parsers/scent/src/loadScent.py 
b/parsers/scent/src/loadScent.py index 41ffd624..b3df7ce4 100644 --- a/parsers/scent/src/loadScent.py +++ b/parsers/scent/src/loadScent.py @@ -4,7 +4,7 @@ from Common.utils import GetData from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES # the scent odorant edge header columns: diff --git a/parsers/textminingkp/src/loadTMKP.py b/parsers/textminingkp/src/loadTMKP.py index 35d62a15..82a96c87 100644 --- a/parsers/textminingkp/src/loadTMKP.py +++ b/parsers/textminingkp/src/loadTMKP.py @@ -3,7 +3,7 @@ import argparse import enum -from Common.node_types import PUBLICATIONS +from Common.biolink_constants import * from Common.utils import GetData from Common.kgxmodel import kgxedge from Common.loader_interface import SourceDataLoader @@ -145,7 +145,9 @@ def parse_data(self) -> dict: edge_props = {PUBLICATIONS: [paper_id for paper_id in paper_idxs.split('|')], "biolink:tmkp_confidence_score": float(confidence_score), "sentences": "|".join(sentences), - "tmkp_ids": [tmkp_id for tmkp_id in tmpk_idxs.split('|')]} + "tmkp_ids": [tmkp_id for tmkp_id in tmpk_idxs.split('|')], + KNOWLEDGE_LEVEL: NOT_PROVIDED, + AGENT_TYPE: TEXT_MINING_AGENT} # look for any qualifiers and add them to edge_props if they have values for qualifier_index, qualifier_attribute in TMKP_QUALIFIER_ATTRIBUTES.items(): diff --git a/parsers/yeast/src/loadCostanza2016.py b/parsers/yeast/src/loadCostanza2016.py index d01b576c..64bee95d 100644 --- a/parsers/yeast/src/loadCostanza2016.py +++ b/parsers/yeast/src/loadCostanza2016.py @@ -4,7 +4,7 @@ import requests from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE, PUBLICATIONS +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, PUBLICATIONS from Common.prefixes import PUBMED from intermine.webservice import Service @@ -31,7 +31,7 @@ class Costanza2016Loader(SourceDataLoader): source_id: str = 'Costanza2016Data' provenance_id: str = 'infores:CostanzaGeneticInteractions' - parsing_version = '1.1' + parsing_version = '1.2' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -144,7 +144,7 @@ def parse_data(self) -> dict: with open(costanza_genetic_interactions, 'r') as fp: extractor.csv_extract(fp, lambda line: f"SGD:{line[COSTANZA_GENEINTERACTIONS.GENE1.value]}-{line[COSTANZA_GENEINTERACTIONS.GENE2.value]}", # subject id - lambda line: line[COSTANZA_GENEINTERACTIONS.GENE1.value], # object id + lambda line: f"SGD:{line[COSTANZA_GENEINTERACTIONS.GENE1.value]}", # object id lambda line: "biolink:has_part", # predicate extractor lambda line: {}, # subject props lambda line: {}, # object props @@ -160,7 +160,7 @@ def parse_data(self) -> dict: with open(costanza_genetic_interactions, 'r') as fp: extractor.csv_extract(fp, lambda line: f"SGD:{line[COSTANZA_GENEINTERACTIONS.GENE1.value]}-{line[COSTANZA_GENEINTERACTIONS.GENE2.value]}", # subject id - lambda line: line[COSTANZA_GENEINTERACTIONS.GENE2.value], # object id + lambda line: f"SGD:{line[COSTANZA_GENEINTERACTIONS.GENE2.value]}", # object id lambda line: "biolink:has_part", # predicate extractor lambda line: {}, # subject props lambda line: {}, # object props diff --git a/parsers/yeast/src/loadHistoneMap.py b/parsers/yeast/src/loadHistoneMap.py index d3e3000b..84859adc 100644 --- 
a/parsers/yeast/src/loadHistoneMap.py +++ b/parsers/yeast/src/loadHistoneMap.py @@ -7,7 +7,7 @@ from parsers.yeast.src.yeast_constants import YEAST_GENOME_RESOLUTION, SGD_ALL_GENES_FILE, HISTONE_LOCI_FILE from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE #List of Binned Histone Modifications class HISTONEMODBINS_EDGEUMAN(enum.IntEnum): @@ -275,9 +275,8 @@ def fetch_histone_data(self, "CTD:increases_abundance_of"] self.logger.debug('Histone Modifications Mapped to GO Terms!') - csv_fname = f"HistonePTM2GO.csv" histonePTM2GO_df = pd.DataFrame.from_dict(histonePTM2GO) - histonePTM2GO_df.to_csv(os.path.join(output_directory, csv_fname), encoding="utf-8-sig", index=False) + histonePTM2GO_df.to_csv(os.path.join(output_directory, self.histone_mod_to_go_term_file_name), encoding="utf-8-sig", index=False) for chr in chromosome_lengths.keys(): m = int(chromosome_lengths[chr]) for i in range(m): # Create loci nodes for chromosomes @@ -303,7 +302,7 @@ def fetch_histone_data(self, data['histoneMod'].append(ptm) genomelocidf = pd.DataFrame(data) self.logger.debug('Histone Modifications Loci Collected!') - genomelocidf.to_csv(os.path.join(output_directory, HISTONE_LOCI_FILE), encoding="utf-8-sig", index=False) + genomelocidf.to_csv(os.path.join(output_directory, self.histone_mod_list_file_name), encoding="utf-8-sig", index=False) if not generate_gene_mapping: return @@ -336,5 +335,4 @@ def fetch_histone_data(self, genomelocidf = genomelocidf.merge(just_windows, how='inner', on=['chromosomeID', 'start', 'end', 'loci']) self.logger.debug(f"Histone Modifications Mapping Complete!") - csv_f3name = f"HistoneMod2Gene.csv" - genomelocidf.to_csv(os.path.join(output_directory, csv_f3name), encoding="utf-8-sig", index=False) + genomelocidf.to_csv(os.path.join(output_directory, self.histone_mod_to_gene_file_name), encoding="utf-8-sig", index=False) diff --git a/parsers/yeast/src/loadYeastGeneExpressionGasch.py b/parsers/yeast/src/loadYeastGeneExpressionGasch.py index f00ca0e8..968f10ca 100644 --- a/parsers/yeast/src/loadYeastGeneExpressionGasch.py +++ b/parsers/yeast/src/loadYeastGeneExpressionGasch.py @@ -5,7 +5,7 @@ from parsers.SGD.src.sgd_source_retriever import SGDAllGenes from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE # Maps Experimental Condition affects Nucleosome edge. 
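The loadHistoneMap.py hunks above write to self.histone_mod_to_go_term_file_name, self.histone_mod_list_file_name, and self.histone_mod_to_gene_file_name, so those attributes are presumably defined in the loader's __init__, which is outside the hunks shown here. A minimal sketch of that assumption, reusing the file names the removed lines had hardcoded:

    # assumed to be set in YeastHistoneMapLoader.__init__ (not shown in this diff);
    # keeping the file names as object settings lets every function write to the same paths
    self.histone_mod_to_go_term_file_name = "HistonePTM2GO.csv"
    self.histone_mod_list_file_name = HISTONE_LOCI_FILE
    self.histone_mod_to_gene_file_name = "HistoneMod2Gene.csv"
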
diff --git a/parsers/yeast/src/loadYeastNucleosomesGSE61888.py b/parsers/yeast/src/loadYeastNucleosomesGSE61888.py index 72207521..b094bd89 100644 --- a/parsers/yeast/src/loadYeastNucleosomesGSE61888.py +++ b/parsers/yeast/src/loadYeastNucleosomesGSE61888.py @@ -5,7 +5,7 @@ from Common.utils import GetData, int_to_roman_numeral from Common.loader_interface import SourceDataLoader from Common.extractor import Extractor -from Common.node_types import PRIMARY_KNOWLEDGE_SOURCE +from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE from parsers.yeast.src.yeast_constants import HISTONE_LOCI_FILE, YEAST_GENOME_RESOLUTION from parsers.yeast.src.loadHistoneMap import YeastHistoneMapLoader diff --git a/requirements.txt b/requirements.txt index 36c3b823..519dcb40 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,18 @@ -pandas>=1.5.0 -requests>=2.28.1 -pytest>=7.1.3 -git+https://github.com/ObesityHub/robokop-genetics.git +pandas==2.2.2 +requests==2.32.3 +pytest==8.2.0 +robokop-genetics==0.5.0 # intermine is on pypi but as of 6/23 it's broken for python 3.10+, this fork fixes the issue git+https://github.com/EvanDietzMorris/intermine-ws-python.git -jsonlines>=3.1.0 -pyyaml==6.0 -beautifulsoup4==4.11.1 -psycopg2-binary>=2.9.3 -orjson==3.9.15 -xxhash==3.0.0 -mysql-connector-python==8.0.28 -neo4j==5.10.0 -pyoxigraph==0.3.6 -curies>=0.4.0 -prefixmaps>=0.1.4 -bmt==1.0.15 +jsonlines==4.0.0 +pyyaml==6.0.1 +beautifulsoup4==4.12.3 +psycopg2-binary==2.9.9 +orjson==3.10.3 +xxhash==3.4.1 +mysql-connector-python==8.4.0 +neo4j==5.20.0 +pyoxigraph==0.3.22 +curies==0.7.9 +prefixmaps==0.2.4 +bmt==1.4.1 diff --git a/set_up_test_env.sh b/set_up_test_env.sh index 39d305cb..1ef6edd3 100644 --- a/set_up_test_env.sh +++ b/set_up_test_env.sh @@ -22,12 +22,15 @@ export ORION_GRAPH_SPEC=testing-graph-spec.yml #ORION_GRAPH_SPEC_URL - a URL pointing to a Graph Spec file #export ORION_GRAPH_SPEC_URL=https://raw.githubusercontent.com/RENCI-AUTOMAT/ORION/helm_deploy/graph_specs/yeast-graph-spec.yml -export ORION_NEO4J_PASSWORD=insecurepasswordexample +export PYTHONPATH="$PYTHONPATH:$PWD" -export ORION_OUTPUT_URL=https://localhost/ #The following environment variables are optional -export EDGE_NORMALIZATION_ENDPOINT=https://bl-lookup-sri.renci.org/ -export NODE_NORMALIZATION_ENDPOINT=https://nodenormalization-sri.renci.org/ - -export PYTHONPATH="$PYTHONPATH:$PWD" \ No newline at end of file +# export EDGE_NORMALIZATION_ENDPOINT=https://bl-lookup-sri.renci.org/ +# export NODE_NORMALIZATION_ENDPOINT=https://nodenormalization-sri.renci.org/ +# export NAME_RESOLVER_ENDPOINT=https://name-resolution-sri.renci.org/ +# export ORION_OUTPUT_URL=https://localhost/ # this is currently only used to generate metadata +# export BL_VERSION=4.2.1 + +# if you are building your own docker image and issues occur, setting the correct platform may help +# export DOCKER_PLATFORM=linux/arm64 diff --git a/tests/test_file_writer.py b/tests/test_file_writer.py index 10074435..f037602c 100644 --- a/tests/test_file_writer.py +++ b/tests/test_file_writer.py @@ -3,7 +3,7 @@ from Common.utils import quick_jsonl_file_iterator from Common.kgx_file_writer import KGXFileWriter from Common.kgxmodel import kgxnode, kgxedge -from Common.node_types import * +from Common.biolink_constants import * test_workspace_dir = os.path.dirname(os.path.abspath(__file__)) + '/workspace/' # TODO this is hacky and should be done with better design in pytest or somewhere else diff --git a/tests/test_merging.py b/tests/test_merging.py index f3647075..fc71fa5c 100644 
diff --git a/tests/test_merging.py b/tests/test_merging.py
index f3647075..fc71fa5c 100644
--- a/tests/test_merging.py
+++ b/tests/test_merging.py
@@ -1,5 +1,5 @@
 from Common.merging import GraphMerger, MemoryGraphMerger, DiskGraphMerger
-from Common.node_types import *
+from Common.biolink_constants import *
 import os
 import json
@@ -25,6 +25,7 @@ def node_property_merging_test(graph_merger: GraphMerger):
     assert 'SYN_X' in merged_node[SYNONYMS] and 'SYN_5' in merged_node[SYNONYMS]
     assert len(merged_node[NODE_TYPES]) == 1

+
 def test_node_property_merging_in_memory():
     node_property_merging_test(MemoryGraphMerger())
@@ -146,3 +147,55 @@ def test_edge_merging_counts_in_memory():

 def test_edge_merging_counts_on_disk():
     edge_merging_counts_test(DiskGraphMerger(temp_directory=TEMP_DIRECTORY, chunk_size=8))
+
+
+def test_qualifier_edge_merging():
+
+    test_edges_up = [{SUBJECT_ID: f'NODE:1',
+                      PREDICATE: 'testing:predicate',
+                      OBJECT_ID: f'NODE:2',
+                      SUBJECT_ASPECT_QUALIFIER: f'test_aspect',
+                      SUBJECT_DIRECTION_QUALIFIER: 'up',
+                      'testing_prop': [i]}
+                     for i in range(1, 16)]
+
+    test_edges_down = [{SUBJECT_ID: f'NODE:1',
+                        PREDICATE: 'testing:predicate',
+                        OBJECT_ID: f'NODE:2',
+                        SUBJECT_ASPECT_QUALIFIER: f'test_aspect',
+                        SUBJECT_DIRECTION_QUALIFIER: 'down',
+                        'testing_prop': [i]}
+                       for i in range(1, 11)]
+
+    test_edges_other = [{SUBJECT_ID: f'NODE:1',
+                         PREDICATE: 'testing:predicate',
+                         OBJECT_ID: f'NODE:2',
+                         SUBJECT_ASPECT_QUALIFIER: f'test_aspect',
+                         SUBJECT_DIRECTION_QUALIFIER: 'down',
+                         SPECIES_CONTEXT_QUALIFIER: 'test_species',
+                         'testing_prop': [i]}
+                        for i in range(1, 6)]
+    graph_merger = MemoryGraphMerger()
+    graph_merger.merge_edges(test_edges_up)
+    graph_merger.merge_edges(test_edges_down)
+    graph_merger.merge_edges(test_edges_other)
+
+    merged_edges = [json.loads(edge) for edge in graph_merger.get_merged_edges_jsonl()]
+    assert len(merged_edges) == 3
+
+    passed_tests = 0
+    for edge in merged_edges:
+        if edge[SUBJECT_DIRECTION_QUALIFIER] == 'up':
+            assert len(edge['testing_prop']) == 15
+            assert SPECIES_CONTEXT_QUALIFIER not in edge
+            passed_tests += 1
+        elif edge[SUBJECT_DIRECTION_QUALIFIER] == 'down' and SPECIES_CONTEXT_QUALIFIER not in edge:
+            assert len(edge['testing_prop']) == 10
+            passed_tests += 1
+        elif edge[SUBJECT_DIRECTION_QUALIFIER] == 'down' and SPECIES_CONTEXT_QUALIFIER in edge:
+            assert len(edge['testing_prop']) == 5
+            passed_tests += 1
+
+    assert passed_tests == 3
+
+
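The new test_qualifier_edge_merging case checks that edges sharing the same subject, predicate, and object are still kept apart when their qualifier values differ, and that properties only accumulate within each qualifier group (15, 10, and 5 values respectively). A rough standalone sketch of that grouping idea, not the actual Common.merging implementation; the plain-string qualifier keys below are illustrative stand-ins for the biolink constants used in the test:

import json
from collections import defaultdict

QUALIFIER_KEYS = ('subject_aspect_qualifier', 'subject_direction_qualifier', 'species_context_qualifier')

def edge_merge_key(edge):
    # edges merge only when subject, predicate, object AND every qualifier value match
    qualifiers = {k: edge[k] for k in QUALIFIER_KEYS if k in edge}
    return json.dumps([edge['subject'], edge['predicate'], edge['object'], qualifiers], sort_keys=True)

def merge_edges(edges):
    groups = defaultdict(list)
    for edge in edges:
        groups[edge_merge_key(edge)].append(edge)
    return groups

edges = [
    {'subject': 'NODE:1', 'predicate': 'testing:predicate', 'object': 'NODE:2',
     'subject_direction_qualifier': 'up'},
    {'subject': 'NODE:1', 'predicate': 'testing:predicate', 'object': 'NODE:2',
     'subject_direction_qualifier': 'down'},
    {'subject': 'NODE:1', 'predicate': 'testing:predicate', 'object': 'NODE:2',
     'subject_direction_qualifier': 'down', 'species_context_qualifier': 'test_species'},
]
assert len(merge_edges(edges)) == 3  # three distinct qualifier combinations, as in the test above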
diff --git a/tests/test_normalization.py b/tests/test_normalization.py
index ddf74632..0fe347b6 100644
--- a/tests/test_normalization.py
+++ b/tests/test_normalization.py
@@ -1,7 +1,7 @@
 import pytest

-from Common.normalization import NodeNormalizer, EdgeNormalizer, EdgeNormalizationResult
-from Common.node_types import ROOT_ENTITY, \
-    GENE, SEQUENCE_VARIANT, FALLBACK_EDGE_PREDICATE, CUSTOM_NODE_TYPES, INFORMATION_CONTENT, NODE_TYPES
+from Common.biolink_constants import NAMED_THING, GENE, SEQUENCE_VARIANT, INFORMATION_CONTENT, NODE_TYPES
+from Common.normalization import NodeNormalizer, EdgeNormalizer, EdgeNormalizationResult, \
+    FALLBACK_EDGE_PREDICATE, CUSTOM_NODE_TYPES

 INVALID_NODE_TYPE = "testing:Type1"
@@ -22,7 +22,7 @@ def test_nodes():
         {"id": "ENSEMBL:testing_id", "name": "broken gene", NODE_TYPES: [GENE, GENE, INVALID_NODE_TYPE]},
         {"id": "TESTING:testing_id", "name": "broken gene 2", NODE_TYPES: [INVALID_NODE_TYPE]},
         {"id": "TESTING:nameless", "name": "", NODE_TYPES: [INVALID_NODE_TYPE], "test_prop": 1},
-        {"id": "CHEBI:33551", NODE_TYPES: [ROOT_ENTITY]}
+        {"id": "CHEBI:33551", NODE_TYPES: [NAMED_THING]}
     ]
     return nodes
@@ -38,7 +38,7 @@ def test_node_norm(test_nodes):
     normalized_node = get_node_from_list(normalized_id, test_nodes)
     assert normalized_node is not None
     assert GENE in normalized_node[NODE_TYPES]
-    assert ROOT_ENTITY in normalized_node[NODE_TYPES]
+    assert NAMED_THING in normalized_node[NODE_TYPES]
     assert CUSTOM_NODE_TYPES not in normalized_node
     assert normalized_node['test_prop'] == 1
@@ -65,20 +65,20 @@ def test_node_norm_lenient(test_nodes):
     assert normalized_id == correct_normalized_id
     normalized_node = get_node_from_list(normalized_id, test_nodes)
     assert normalized_node is not None
-    assert ROOT_ENTITY in normalized_node[NODE_TYPES]
+    assert NAMED_THING in normalized_node[NODE_TYPES]
     assert INVALID_NODE_TYPE in normalized_node[CUSTOM_NODE_TYPES]
     assert normalized_node['test_prop'] == 1

     normalized_id = node_normalizer.node_normalization_lookup['ENSEMBL:testing_id'][0]
     normalized_node = get_node_from_list(normalized_id, test_nodes)
     assert INVALID_NODE_TYPE in normalized_node[CUSTOM_NODE_TYPES]
-    assert ROOT_ENTITY in normalized_node[NODE_TYPES]
-    assert len(normalized_node[NODE_TYPES]) == 2  # should be GENE and ROOT_ENTITY
+    assert NAMED_THING in normalized_node[NODE_TYPES]
+    assert len(normalized_node[NODE_TYPES]) == 2  # should be GENE and NAMED_THING

     normalized_id = node_normalizer.node_normalization_lookup['TESTING:nameless'][0]
     normalized_node = get_node_from_list(normalized_id, test_nodes)
     assert INVALID_NODE_TYPE in normalized_node[CUSTOM_NODE_TYPES]
-    assert ROOT_ENTITY in normalized_node[NODE_TYPES]
+    assert NAMED_THING in normalized_node[NODE_TYPES]
     assert normalized_node['name'] == 'nameless'
@@ -129,7 +129,7 @@ def test_variant_node_norm():
     assert len(variant_nodes_2) == 13
     bogus_node_after_normalization = get_node_from_list('BOGUS:rs999999999999', variant_nodes_2)
     assert bogus_node_after_normalization['name'] == 'BOGUS:rs999999999999'
-    assert ROOT_ENTITY in bogus_node_after_normalization[NODE_TYPES]
+    assert NAMED_THING in bogus_node_after_normalization[NODE_TYPES]
     assert SEQUENCE_VARIANT in bogus_node_after_normalization[NODE_TYPES]
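The test_normalization.py changes swap ROOT_ENTITY for the NAMED_THING constant imported from Common.biolink_constants. With the environment variables from set_up_test_env.sh in place, the two updated test modules can be run on their own; a minimal sketch, with the module paths taken from the diffs above and everything else assumed:

# Sketch: run only the test modules touched by this patch; assumes the repository root
# is on PYTHONPATH (see set_up_test_env.sh) so the Common package resolves, and that the
# normalization endpoints are reachable.
import pytest

raise SystemExit(pytest.main(['tests/test_merging.py', 'tests/test_normalization.py', '-q']))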