
Commit

adding may treat kp
EvanDietzMorris committed Sep 9, 2024
1 parent b124dfb commit ae386da
Showing 3 changed files with 97 additions and 0 deletions.
2 changes: 2 additions & 0 deletions Common/data_sources.py
@@ -23,6 +23,7 @@
LITCOIN_SAPBERT = 'LitCoinSapBERT'
LITCOIN_ENTITY_EXTRACTOR = 'LitCoinEntityExtractor'
KINACE = 'KinAce'
MAYTREATKP = 'MayTreatKP'
MOLEPRO = 'MolePro'
MONARCH_KG = 'MonarchKG'
MONDO_PROPS = 'MONDOProps'
@@ -71,6 +72,7 @@
LITCOIN_ENTITY_EXTRACTOR: ("parsers.LitCoin.src.loadLitCoin", "LitCoinEntityExtractorLoader"),
LITCOIN_SAPBERT: ("parsers.LitCoin.src.loadLitCoin", "LitCoinSapBERTLoader"),
KINACE: ("parsers.KinAce.src.loadKinAce", "KinAceLoader"),
MAYTREATKP: ("parsers.MayTreat.src.loadMayTreat", "MayTreatLoader"),
MOLEPRO: ("parsers.molepro.src.loadMolePro", "MoleProLoader"),
MONARCH_KG: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGLoader"),
MONDO_PROPS: ("parsers.MONDOProperties.src.loadMP", "MPLoader"),
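The entries added above map a source ID to a (module path, class name) tuple. As an illustrative sketch, not part of this commit, of how such a registry is typically consumed, a loader can be resolved and instantiated with importlib; the registry dict name source_data_loader_classes and the helper get_loader_instance are assumed here for illustration only:

import importlib

def get_loader_instance(source_id, source_data_loader_classes, **loader_kwargs):
    # look up the (module path, class name) tuple registered for this source id
    module_path, class_name = source_data_loader_classes[source_id]
    # import the parser module lazily and fetch the loader class from it
    loader_module = importlib.import_module(module_path)
    loader_class = getattr(loader_module, class_name)
    # construct the loader, passing through pipeline options such as test_mode
    return loader_class(**loader_kwargs)

# e.g. get_loader_instance(MAYTREATKP, source_data_loader_classes, test_mode=True)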
8 changes: 8 additions & 0 deletions graph_specs/default-graph-spec.yml
@@ -205,6 +205,14 @@ graphs:
      - source_id: OntologicalHierarchy
        merge_strategy: connected_edge_subset

  - graph_id: MayTreatKP_Automat
    graph_name: Electronic Health Record (EHR) May Treat KP
    graph_description: 'The Electronic Health Record (EHR) May Treat KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. This May Treat KP provides a knowledge graph pointing from medications to a variety of health outcomes, specifically for diseases prior to treatment. We use data from over 28 million EHRs to train a large collection of interpretable machine learning models, which are integrated into a single large knowledge graph. The edges of the graph are generated by running ~300 logistic regression models for clinical conditions, with features including age, sex, medical conditions, and medications as nodes, to predict associations with disease outcomes.'
    graph_url: https://github.com/NCATSTranslator/Translator-All/wiki/Multiomics-EHRMLA-May-Treat-KP
    output_format: neo4j
    sources:
      - source_id: MayTreatKP

  - graph_id: PANTHER_Automat
    graph_name: PANTHER
    graph_description: 'The Protein ANalysis THrough Evolutionary Relationships (PANTHER) classification system provides an openly available annotation library of gene family phylogenetic trees, with persistent identifiers attached to all nodes in the trees and annotation of each protein member of the family by its family and protein class, subfamily, orthologs, paralogs, GO Phylogenetic Annotation Project function, and Reactome pathways.'
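The MayTreatKP graph description above says the edges come from roughly 300 logistic regression models, and the new loader below carries log_odds_ratio, its 95% confidence interval bounds, and an adjusted p-value as edge properties. As a hedged illustration of how those statistics relate, not taken from this source: for a binary medication feature, the fitted coefficient is itself the log odds ratio, and a normal-approximation 95% confidence interval is the coefficient plus or minus 1.96 standard errors.

import math

def log_odds_ratio_ci(beta: float, standard_error: float, z: float = 1.96):
    # beta is a fitted logistic regression coefficient for a binary medication
    # feature, which equals the log odds ratio for that medication; the interval
    # uses the usual normal approximation (illustrative only, not from the KP)
    lower = beta - z * standard_error
    upper = beta + z * standard_error
    return beta, lower, upper

log_or, ci_lower, ci_upper = log_odds_ratio_ci(beta=0.42, standard_error=0.10)
odds_ratio = math.exp(log_or)  # convert back to an odds ratio if needed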
87 changes: 87 additions & 0 deletions parsers/MayTreat/src/loadMayTreat.py
@@ -0,0 +1,87 @@
import os
import requests
import yaml
import csv
import json

from Common.utils import GetData
from Common.loader_interface import SourceDataLoader


class MayTreatLoader(SourceDataLoader):
    source_id: str = "MayTreatKP"
    provenance_id: str = "infores:isb-EHRMLA-data"
    description = "Multiomics EHRMLA May Treat KP."
    source_data_url = "https://github.com/NCATSTranslator/Translator-All/wiki/Multiomics-EHRMLA-May-Treat-KP"
    license = "https://github.com/NCATSTranslator/Translator-All/wiki/Multiomics-EHRMLA-May-Treat-KP"
    attribution = "https://github.com/NCATSTranslator/Translator-All/wiki/Multiomics-EHRMLA-May-Treat-KP"
    parsing_version = "1.0"

    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
        """
        :param test_mode - sets the run into test mode
        :param source_data_dir - the specific storage directory to save files in
        """
        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

        self.data_url: str = 'https://stars.renci.org/var/data_services/ehrmaytreatkp/'
        self.edge_file_name: str = 'edges.csv'
        self.data_file = self.edge_file_name
        self.version_file = 'ehr-may-treat-kp.yaml'

        # columns that should not be carried over onto the edges
        self.edge_file_property_ignore_list = ['subject_name', 'object_name', 'KG_type', 'category', '']
        # columns whose csv string values need to be converted to typed values
        self.edge_file_json_properties = ['log_odds_ratio_95_ci']
        self.edge_file_float_properties = ['auc_roc', 'log_odds_ratio', 'log_odds_ratio_95_ci_lower',
                                           'log_odds_ratio_95_ci_upper', 'adjusted_p_value']
        self.edge_file_int_properties = ['positive_patient_count', 'negative_patient_count', 'total_sample_size']

    def get_latest_source_version(self) -> str:
        # the latest version is the 'build' value from the metadata yaml published alongside the data
        version_file_url = f"{self.data_url}{self.version_file}"
        r = requests.get(version_file_url)
        version_yaml = yaml.full_load(r.text)
        build_version = str(version_yaml['build'])
        return build_version

    def get_data(self) -> bool:
        data_puller = GetData()
        source_url = f"{self.data_url}{self.data_file}"
        data_puller.pull_via_http(source_url, self.data_path)
        return True

    def parse_data(self) -> dict:
        """
        Parses the data file for graph nodes/edges and writes them to the KGX files.
        :return: ret_val: record counts
        """
        record_counter = 0
        skipped_record_counter = 0

        edge_file_path: str = os.path.join(self.data_path, self.edge_file_name)
        with open(edge_file_path, 'r', newline='') as edge_file:

            csv_reader = csv.DictReader(edge_file, quotechar='"')
            for edge in csv_reader:
                try:
                    # drop columns that should not become edge properties
                    for prop in self.edge_file_property_ignore_list:
                        edge.pop(prop, None)
                    # convert json and numeric columns from their csv string values
                    for edge_prop in edge:
                        if edge_prop in self.edge_file_json_properties:
                            edge[edge_prop] = json.loads(edge[edge_prop])
                        elif edge_prop in self.edge_file_float_properties:
                            edge[edge_prop] = float(edge[edge_prop])
                        elif edge_prop in self.edge_file_int_properties:
                            edge[edge_prop] = int(edge[edge_prop])
                    # write the subject and object nodes, then the edge itself
                    self.output_file_writer.write_node(edge['subject'])
                    self.output_file_writer.write_node(edge['object'])
                    self.output_file_writer.write_normalized_edge(edge)
                    record_counter += 1
                except (ValueError, KeyError) as e:
                    self.logger.error(str(e))
                    skipped_record_counter += 1

        # load up the metadata
        load_metadata: dict = {
            'num_source_lines': record_counter,
            'unusable_source_lines': skipped_record_counter}
        return load_metadata
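To make the conversion step above concrete, here is a standalone sketch of what parse_data does to a single row. The column names follow the property lists in the loader, but the identifiers and values are hypothetical and not taken from the real edges.csv:

import json

# hypothetical edges.csv row as produced by csv.DictReader; values are invented
row = {
    'subject': 'RXCUI:0000000',
    'object': 'MONDO:0000000',
    'subject_name': 'example drug',          # dropped by the ignore list
    'log_odds_ratio': '0.42',
    'log_odds_ratio_95_ci': '[0.22, 0.62]',
    'total_sample_size': '1000000',
}

for prop in ['subject_name', 'object_name', 'KG_type', 'category', '']:
    row.pop(prop, None)
for key in row:
    if key == 'log_odds_ratio_95_ci':
        row[key] = json.loads(row[key])      # json string -> list of floats
    elif key == 'log_odds_ratio':
        row[key] = float(row[key])
    elif key == 'total_sample_size':
        row[key] = int(row[key])
# row now holds typed values, ready to be written out as an edge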
