
Commit

adding may treat kp
EvanDietzMorris committed Sep 9, 2024
1 parent b124dfb commit ae386da
Showing 3 changed files with 97 additions and 0 deletions.
2 changes: 2 additions & 0 deletions Common/data_sources.py
@@ -23,6 +23,7 @@
LITCOIN_SAPBERT = 'LitCoinSapBERT'
LITCOIN_ENTITY_EXTRACTOR = 'LitCoinEntityExtractor'
KINACE = 'KinAce'
MAYTREATKP = 'MayTreatKP'
MOLEPRO = 'MolePro'
MONARCH_KG = 'MonarchKG'
MONDO_PROPS = 'MONDOProps'
@@ -71,6 +72,7 @@
LITCOIN_ENTITY_EXTRACTOR: ("parsers.LitCoin.src.loadLitCoin", "LitCoinEntityExtractorLoader"),
LITCOIN_SAPBERT: ("parsers.LitCoin.src.loadLitCoin", "LitCoinSapBERTLoader"),
KINACE: ("parsers.KinAce.src.loadKinAce", "KinAceLoader"),
MAYTREATKP: ("parsers.MayTreat.src.loadMayTreat", "MayTreatLoader"),
MOLEPRO: ("parsers.molepro.src.loadMolePro", "MoleProLoader"),
MONARCH_KG: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGLoader"),
MONDO_PROPS: ("parsers.MONDOProperties.src.loadMP", "MPLoader"),
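The entries added above map a source ID to a (module path, class name) tuple. As an illustrative sketch, not part of this commit, of how such a registry is typically consumed, a loader can be resolved and instantiated with importlib; the registry dict name source_data_loader_classes and the helper get_loader_instance are assumed here for illustration only:

import importlib

def get_loader_instance(source_id, source_data_loader_classes, **loader_kwargs):
    # look up the (module path, class name) tuple registered for this source id
    module_path, class_name = source_data_loader_classes[source_id]
    # import the parser module lazily and fetch the loader class from it
    loader_module = importlib.import_module(module_path)
    loader_class = getattr(loader_module, class_name)
    # construct the loader, passing through pipeline options such as test_mode
    return loader_class(**loader_kwargs)

# e.g. get_loader_instance(MAYTREATKP, source_data_loader_classes, test_mode=True)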
8 changes: 8 additions & 0 deletions graph_specs/default-graph-spec.yml
@@ -205,6 +205,14 @@ graphs:
      - source_id: OntologicalHierarchy
        merge_strategy: connected_edge_subset

  - graph_id: MayTreatKP_Automat
    graph_name: Electronic Health Record (EHR) May Treat KP
    graph_description: 'The Electronic Health Record (EHR) May Treat KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. This May Treat KP provides a knowledge graph pointing from medications to a variety of health outcomes, specifically for diseases prior to treatment. We use data from over 28 million EHRs to train a large collection of interpretable machine learning models, which are integrated into a single large knowledge graph. The edges of the graph are generated by running ~300 logistic regression models for clinical conditions, with features including age, sex, medical conditions, and medications as nodes, to predict associations with disease outcomes.'
    graph_url: https://github.com/NCATSTranslator/Translator-All/wiki/Multiomics-EHRMLA-May-Treat-KP
    output_format: neo4j
    sources:
      - source_id: MayTreatKP

  - graph_id: PANTHER_Automat
    graph_name: PANTHER
    graph_description: 'The Protein ANalysis THrough Evolutionary Relationships (PANTHER) classification system provides an openly available annotation library of gene family phylogenetic trees, with persistent identifiers attached to all nodes in the trees and annotation of each protein member of the family by its family and protein class, subfamily, orthologs, paralogs, GO Phylogenetic Annotation Project function, and Reactome pathways.'
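The MayTreatKP graph description above says the edges come from roughly 300 logistic regression models, and the new loader below carries log_odds_ratio, its 95% confidence interval bounds, and an adjusted p-value as edge properties. As a hedged illustration of how those statistics relate, not taken from this source: for a binary medication feature, the fitted coefficient is itself the log odds ratio, and a normal-approximation 95% confidence interval is the coefficient plus or minus 1.96 standard errors.

import math

def log_odds_ratio_ci(beta: float, standard_error: float, z: float = 1.96):
    # beta is a fitted logistic regression coefficient for a binary medication
    # feature, which equals the log odds ratio for that medication; the interval
    # uses the usual normal approximation (illustrative only, not from the KP)
    lower = beta - z * standard_error
    upper = beta + z * standard_error
    return beta, lower, upper

log_or, ci_lower, ci_upper = log_odds_ratio_ci(beta=0.42, standard_error=0.10)
odds_ratio = math.exp(log_or)  # convert back to an odds ratio if needed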
87 changes: 87 additions & 0 deletions parsers/MayTreat/src/loadMayTreat.py
@@ -0,0 +1,87 @@
import os
import requests
import yaml
import csv
import json

from Common.utils import GetData
from Common.loader_interface import SourceDataLoader


class MayTreatLoader(SourceDataLoader):
    source_id: str = "MayTreatKP"
    provenance_id: str = "infores:isb-EHRMLA-data"
    description = "Multiomics EHRMLA May Treat KP."
    source_data_url = "https://github.com/NCATSTranslator/Translator-All/wiki/Multiomics-EHRMLA-May-Treat-KP"
    license = "https://github.com/NCATSTranslator/Translator-All/wiki/Multiomics-EHRMLA-May-Treat-KP"
    attribution = "https://github.com/NCATSTranslator/Translator-All/wiki/Multiomics-EHRMLA-May-Treat-KP"
    parsing_version = "1.0"

    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
        """
        :param test_mode - sets the run into test mode
        :param source_data_dir - the specific storage directory to save files in
        """
        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

        self.data_url: str = 'https://stars.renci.org/var/data_services/ehrmaytreatkp/'
        self.edge_file_name: str = 'edges.csv'
        self.data_file = self.edge_file_name
        self.version_file = 'ehr-may-treat-kp.yaml'

        # columns that should not be carried over onto the edges
        self.edge_file_property_ignore_list = ['subject_name', 'object_name', 'KG_type', 'category', '']
        # columns whose csv string values need to be converted to typed values
        self.edge_file_json_properties = ['log_odds_ratio_95_ci']
        self.edge_file_float_properties = ['auc_roc', 'log_odds_ratio', 'log_odds_ratio_95_ci_lower',
                                           'log_odds_ratio_95_ci_upper', 'adjusted_p_value']
        self.edge_file_int_properties = ['positive_patient_count', 'negative_patient_count', 'total_sample_size']

    def get_latest_source_version(self) -> str:
        # the latest version is the 'build' value from the metadata yaml published alongside the data
        version_file_url = f"{self.data_url}{self.version_file}"
        r = requests.get(version_file_url)
        version_yaml = yaml.full_load(r.text)
        build_version = str(version_yaml['build'])
        return build_version

    def get_data(self) -> bool:
        data_puller = GetData()
        source_url = f"{self.data_url}{self.data_file}"
        data_puller.pull_via_http(source_url, self.data_path)
        return True

    def parse_data(self) -> dict:
        """
        Parses the data file for graph nodes/edges and writes them to the KGX files.
        :return: ret_val: record counts
        """
        record_counter = 0
        skipped_record_counter = 0

        edge_file_path: str = os.path.join(self.data_path, self.edge_file_name)
        with open(edge_file_path, 'r', newline='') as edge_file:

            csv_reader = csv.DictReader(edge_file, quotechar='"')
            for edge in csv_reader:
                try:
                    # drop columns that should not become edge properties
                    for prop in self.edge_file_property_ignore_list:
                        edge.pop(prop, None)
                    # convert json and numeric columns from their csv string values
                    for edge_prop in edge:
                        if edge_prop in self.edge_file_json_properties:
                            edge[edge_prop] = json.loads(edge[edge_prop])
                        elif edge_prop in self.edge_file_float_properties:
                            edge[edge_prop] = float(edge[edge_prop])
                        elif edge_prop in self.edge_file_int_properties:
                            edge[edge_prop] = int(edge[edge_prop])
                    # write the subject and object nodes, then the edge itself
                    self.output_file_writer.write_node(edge['subject'])
                    self.output_file_writer.write_node(edge['object'])
                    self.output_file_writer.write_normalized_edge(edge)
                    record_counter += 1
                except (ValueError, KeyError) as e:
                    self.logger.error(str(e))
                    skipped_record_counter += 1

        # load up the metadata
        load_metadata: dict = {
            'num_source_lines': record_counter,
            'unusable_source_lines': skipped_record_counter}
        return load_metadata
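To make the conversion step above concrete, here is a standalone sketch of what parse_data does to a single row. The column names follow the property lists in the loader, but the identifiers and values are hypothetical and not taken from the real edges.csv:

import json

# hypothetical edges.csv row as produced by csv.DictReader; values are invented
row = {
    'subject': 'RXCUI:0000000',
    'object': 'MONDO:0000000',
    'subject_name': 'example drug',          # dropped by the ignore list
    'log_odds_ratio': '0.42',
    'log_odds_ratio_95_ci': '[0.22, 0.62]',
    'total_sample_size': '1000000',
}

for prop in ['subject_name', 'object_name', 'KG_type', 'category', '']:
    row.pop(prop, None)
for key in row:
    if key == 'log_odds_ratio_95_ci':
        row[key] = json.loads(row[key])      # json string -> list of floats
    elif key == 'log_odds_ratio':
        row[key] = float(row[key])
    elif key == 'total_sample_size':
        row[key] = int(row[key])
# row now holds typed values, ready to be written out as an edge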
