Merge pull request #252 from RobokopU24/neo4j_5

Neo4j 5
RobokopU24 · Aug 13, 2024 · b124dfb
2 parents 74ea571 + 0a22609
commit b124dfb
Show file tree

Hide file tree

Showing 24 changed files with 520 additions and 236 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -13,27 +13,27 @@ jobs:
         runs-on: ubuntu-latest
         steps:
             - name: Check out the repo
-              uses: actions/checkout@v2
+              uses: actions/checkout@v4
             - name: Get the version
               id: get_version
               run: echo ::set-output name=VERSION::${GITHUB_REF/refs\/tags\//}
-            - name: Extract metadata (tags, labels) for Docker
-              id: meta
-              uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
-              with:
-                images:
-                    ghcr.io/${{ github.repository }}
             - name: Login to ghcr
-              uses: docker/login-action@v1
+              uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
               with:
                   registry: ${{ env.REGISTRY }}
                   username: ${{ github.actor }}
                   password: ${{ secrets.GITHUB_TOKEN }}
+            - name: Extract metadata (tags, labels) for Docker
+              id: meta
+              uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+              with:
+                images:
+                  ghcr.io/${{ github.repository }}
             - name: Push to GitHub Packages
-              uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
+              uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
               with:
                   context: .
                   push: true
                   tags: ${{ steps.meta.outputs.tags }}
                   labels: ${{ steps.meta.outputs.labels }}
-                  build-args: VERSION=${{ steps.get_version.outputs.VERSION }}
+                  build-args: VERSION=${{ steps.get_version.outputs.VERSION }}
diff --git a/Common/biolink_constants.py b/Common/biolink_constants.py
@@ -48,10 +48,13 @@
 PREDICATE = 'predicate'
 PRIMARY_KNOWLEDGE_SOURCE = 'primary_knowledge_source'
 AGGREGATOR_KNOWLEDGE_SOURCES = 'aggregator_knowledge_source'
+SUPPORTING_DATA_SOURCE = 'supporting_data_source'
 P_VALUE = 'p_value'
 ADJUSTED_P_VALUE = 'adjusted_p_value'
 AGENT_TYPE = 'agent_type'
 KNOWLEDGE_LEVEL = 'knowledge_level'
+MAX_RESEARCH_PHASE = 'max_research_phase'
+HAS_SUPPORTING_STUDY_RESULT = 'has_supporting_study_result'
 
 # enums for knowledge level
 KNOWLEDGE_ASSERTION = 'knowledge_assertion'
@@ -137,6 +140,7 @@
     PREDICATE,
     PRIMARY_KNOWLEDGE_SOURCE,
     AGGREGATOR_KNOWLEDGE_SOURCES,
+    SUPPORTING_DATA_SOURCE,
     PUBLICATIONS,
     SYNONYMS,
     DESCRIPTION,
@@ -147,6 +151,8 @@
     FDA_APPROVAL_STATUS,
     KNOWLEDGE_LEVEL,
     MECHANISM_OF_ACTION,
+    MAX_RESEARCH_PHASE,
+    HAS_SUPPORTING_STUDY_RESULT,
     # qualifiers
     ANATOMICAL_CONTEXT_QUALIFIER,
     CAUSAL_MECHANISM_QUALIFIER,

diff --git a/Common/build_manager.py b/Common/build_manager.py
@@ -12,8 +12,8 @@
 from Common.load_manager import SourceDataManager
 from Common.kgx_file_merger import KGXFileMerger
 from Common.neo4j_tools import create_neo4j_dump
-from Common.kgxmodel import GraphSpec, SubGraphSource, DataSource, NormalizationScheme
-from Common.normalization import NORMALIZATION_CODE_VERSION
+from Common.kgxmodel import GraphSpec, SubGraphSource, DataSource
+from Common.normalization import NORMALIZATION_CODE_VERSION, NormalizationScheme
 from Common.metadata import Metadata, GraphMetadata, SourceMetadata
 from Common.supplementation import SequenceVariantSupplementation
 from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, PREDICATE, PUBLICATIONS
@@ -139,8 +139,7 @@ def build_dependencies(self, graph_spec: GraphSpec):
             subgraph_version = subgraph_source.version
             if self.check_for_existing_graph_dir(subgraph_id, subgraph_version):
                 # load previous metadata
-                graph_metadata = self.get_graph_metadata(subgraph_id, subgraph_version)
-                subgraph_source.graph_metadata = graph_metadata.metadata
+                subgraph_source.graph_metadata = self.get_graph_metadata(subgraph_id, subgraph_version)
             elif self.current_graph_versions[subgraph_id] == subgraph_version:
                 self.logger.warning(f'For graph {graph_spec.graph_id} subgraph dependency '
                                     f'{subgraph_id} version {subgraph_version} is not ready. Building now...')

diff --git a/Common/data_sources.py b/Common/data_sources.py
@@ -4,6 +4,7 @@
 BINDING_DB = 'BINDING-DB'
 CAM_KP = 'CAM-KP'
 CHEBI_PROPERTIES = 'CHEBIProps'
+CLINICAL_TRIALS_KP = 'ClinicalTrialsKP'
 CORD19 = 'Cord19'
 CTD = 'CTD'
 DRUG_CENTRAL = 'DrugCentral'
@@ -51,6 +52,7 @@
     BINDING_DB: ("parsers.BINDING.src.loadBINDINGDB", "BINDINGDBLoader"),
     CAM_KP: ("parsers.camkp.src.loadCAMKP", "CAMKPLoader"),
     CHEBI_PROPERTIES: ("parsers.chebi.src.loadChebiProperties", "ChebiPropertiesLoader"),
+    CLINICAL_TRIALS_KP: ("parsers.clinicaltrials.src.loadCTKP", "CTKPLoader"),
     CORD19: ("parsers.cord19.src.loadCord19", "Cord19Loader"),
     CTD: ("parsers.CTD.src.loadCTD", "CTDLoader"),
     DRUG_CENTRAL: ("parsers.drugcentral.src.loaddrugcentral", "DrugCentralLoader"),

diff --git a/Common/kgx_file_converter.py b/Common/kgx_file_converter.py
@@ -94,7 +94,7 @@ def __determine_properties_and_types(file_path: str, required_properties: dict):
         for key, value in entity.items():
             if value is None:
                 property_type_counts[key]["None"] += 1
-                if key in required_properties:
+                if key in required_properties and key != "name":
                     print(f'WARNING: Required property ({key}) was None: {entity.items()}')
                     raise Exception(
                         f'None found as a value for a required property (property: {key}) in line {entity.items()}')
@@ -134,7 +134,7 @@ def __determine_properties_and_types(file_path: str, required_properties: dict):
         # if 'None' in prop_types:
             # print(f'WARNING: None found as a value for property {prop}')
 
-        if prop in required_properties and (num_prop_types > 1):
+        if prop in required_properties and (num_prop_types > 1) and prop != "name":
             # TODO this should just enforce that required properties are the correct type,
             #  instead of trying to establish the type
             raise Exception(f'Required property {prop} had multiple conflicting types: {type_counts.items()}')
@@ -192,7 +192,10 @@ def __convert_to_csv(input_file: str,
         for item in quick_jsonl_file_iterator(input_file):
             for key in list(item.keys()):
                 if item[key] is None:
-                    del item[key]
+                    if key == "name":
+                        item["name"] = item["id"]
+                    else:
+                        del item[key]
                 else:
                     prop_type = properties[key]
                     # convert lists into strings with an array delimiter

diff --git a/Common/kgx_file_merger.py b/Common/kgx_file_merger.py
@@ -85,8 +85,10 @@ def merge_primary_sources(self,
         needs_on_disk_merge = False
         for graph_source in graph_sources:
             if isinstance(graph_source, SubGraphSource):
-                needs_on_disk_merge = True
-                break
+                for source_id in graph_source.graph_metadata.get_source_ids():
+                    if source_id in RESOURCE_HOGS:
+                        needs_on_disk_merge = True
+                        break
             elif graph_source.id in RESOURCE_HOGS:
                 needs_on_disk_merge = True
                 break

diff --git a/Common/kgx_file_normalizer.py b/Common/kgx_file_normalizer.py
@@ -5,25 +5,13 @@
 from Common.biolink_utils import BiolinkInformationResources, INFORES_STATUS_INVALID, INFORES_STATUS_DEPRECATED
 from Common.biolink_constants import SEQUENCE_VARIANT, PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, \
     PUBLICATIONS, OBJECT_ID, SUBJECT_ID, PREDICATE, SUBCLASS_OF
-from Common.normalization import NodeNormalizer, EdgeNormalizer, EdgeNormalizationResult
+from Common.normalization import NormalizationScheme, NodeNormalizer, EdgeNormalizer, EdgeNormalizationResult, \
+    NormalizationFailedError
 from Common.utils import LoggingUtil, chunk_iterator
 from Common.kgx_file_writer import KGXFileWriter
-from Common.kgxmodel import NormalizationScheme
 from Common.merging import MemoryGraphMerger, DiskGraphMerger
 
 
-class NormalizationBrokenError(Exception):
-    def __init__(self, error_message: str, actual_error: Exception=None):
-        self.error_message = error_message
-        self.actual_error = actual_error
-
-
-class NormalizationFailedError(Exception):
-    def __init__(self, error_message: str, actual_error: Exception=None):
-        self.error_message = error_message
-        self.actual_error = actual_error
-
-
 EDGE_PROPERTIES_THAT_SHOULD_BE_SETS = {AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS}
 NODE_NORMALIZATION_BATCH_SIZE = 1_000_000
 EDGE_NORMALIZATION_BATCH_SIZE = 1_000_000
@@ -350,6 +338,7 @@ def normalize_edge_file(self):
                             # this could happen due to rare cases of normalization splits where one node normalizes to many
                             if edge_count > 1:
                                 edge_splits += edge_count - 1
+
                     graph_merger.merge_edges(normalized_edges)
                     self.logger.info(f'Processed {number_of_source_edges} edges so far...')
 

diff --git a/Common/kgxmodel.py b/Common/kgxmodel.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from Common.biolink_constants import NAMED_THING
-from Common.normalization import NORMALIZATION_CODE_VERSION
+from Common.metadata import GraphMetadata
+from Common.normalization import NormalizationScheme
 
 class kgxnode:
     def __init__(self,
@@ -33,31 +34,6 @@ def __init__(self,
             self.properties = {}
 
 
-@dataclass
-class NormalizationScheme:
-    node_normalization_version: str = 'latest'
-    edge_normalization_version: str = 'latest'
-    normalization_code_version: str = NORMALIZATION_CODE_VERSION
-    strict: bool = True
-    conflation: bool = False
-
-    def get_composite_normalization_version(self):
-        composite_normalization_version = f'{self.node_normalization_version}_' \
-                                f'{self.edge_normalization_version}_{self.normalization_code_version}'
-        if self.conflation:
-            composite_normalization_version += '_conflated'
-        if self.strict:
-            composite_normalization_version += '_strict'
-        return composite_normalization_version
-
-    def get_metadata_representation(self):
-        return {'node_normalization_version': self.node_normalization_version,
-                'edge_normalization_version': self.edge_normalization_version,
-                'normalization_code_version': self.normalization_code_version,
-                'conflation': self.conflation,
-                'strict': self.strict}
-
-
 @dataclass
 class GraphSpec:
     graph_id: str
@@ -91,13 +67,13 @@ class GraphSource:
 
 @dataclass
 class SubGraphSource(GraphSource):
-    graph_metadata: dict = None
+    graph_metadata: GraphMetadata = None
 
     def get_metadata_representation(self):
         return {'graph_id': self.id,
                 'release_version': self.version,
                 'merge_strategy:': self.merge_strategy,
-                'graph_metadata': self.graph_metadata}
+                'graph_metadata': self.graph_metadata.metadata if self.graph_metadata else None}
 
 
 @dataclass

diff --git a/Common/load_manager.py b/Common/load_manager.py
@@ -5,9 +5,8 @@
 
 from Common.data_sources import SourceDataLoaderClassFactory, RESOURCE_HOGS, get_available_data_sources
 from Common.utils import LoggingUtil, GetDataPullError
-from Common.kgx_file_normalizer import KGXFileNormalizer, NormalizationBrokenError, NormalizationFailedError
-from Common.kgxmodel import NormalizationScheme
-from Common.normalization import NodeNormalizer, EdgeNormalizer
+from Common.kgx_file_normalizer import KGXFileNormalizer
+from Common.normalization import NormalizationScheme, NodeNormalizer, EdgeNormalizer, NormalizationFailedError
 from Common.metadata import SourceMetadata
 from Common.loader_interface import SourceDataBrokenError, SourceDataFailedError
 from Common.supplementation import SequenceVariantSupplementation, SupplementationFailedError
@@ -356,17 +355,6 @@ def normalize_source(self,
                                                           normalization_status=SourceMetadata.STABLE,
                                                           normalization_info=normalization_info)
             return True
-        except NormalizationBrokenError as broken_error:
-            error_message = f"{source_id} NormalizationBrokenError: {broken_error.error_message}"
-            if broken_error.actual_error:
-                error_message += f" - {broken_error.actual_error}"
-            self.logger.error(error_message)
-            source_metadata.update_normalization_metadata(parsing_version,
-                                                          composite_normalization_version,
-                                                          normalization_status=SourceMetadata.BROKEN,
-                                                          normalization_error=error_message,
-                                                          normalization_time=current_time)
-            return False
         except NormalizationFailedError as failed_error:
             error_message = f"{source_id} NormalizationFailedError: {failed_error.error_message}"
             if failed_error.actual_error:

diff --git a/Common/merging.py b/Common/merging.py
@@ -19,17 +19,34 @@ def edge_key_function(edge):
 
 
 def entity_merging_function(entity_1, entity_2, properties_that_are_sets):
-    for key, value in entity_2.items():
-        # TODO - make sure this is the behavior we want -
-        # for properties that are lists append the values
-        # otherwise keep the first one
-        if key in entity_1:
-            if isinstance(value, list):
-                entity_1[key].extend(value)
-                if key in properties_that_are_sets:
-                    entity_1[key] = list(set(entity_1[key]))
+    """Merge entity_2's properties into entity_1 in place and return entity_1."""
+    for key, entity_2_value in entity_2.items():
+        # if entity 1 also has the property and entity_2_value is not null/empty:
+        # concatenate values if one is a list, otherwise ignore the property from entity 2
+        if (key in entity_1) and entity_2_value:
+            entity_1_value = entity_1[key]
+            entity_1_is_list = isinstance(entity_1_value, list)
+            entity_2_is_list = isinstance(entity_2_value, list)
+            if entity_1_is_list and entity_2_is_list:
+                # if they're both lists just combine them
+                entity_1_value.extend(entity_2_value)
+            elif entity_1_is_list:
+                # if 1 is a list and 2 isn't, append the value of 2 to the list from 1
+                entity_1_value.append(entity_2_value)
+            elif entity_2_is_list:
+                if entity_1_value:
+                    # if 2 is a list and 1 has a value, add the value of 1 to the list from 2
+                    entity_1[key] = [entity_1_value] + entity_2_value
+                else:
+                    # if 2 is a list and 1 doesn't have a value, just use the list from 2
+                    entity_1[key] = entity_2_value
+            # else:
+            # if neither is a list, do nothing (keep the value from 1)
+            if (entity_1_is_list or entity_2_is_list) and (key in properties_that_are_sets):
+                entity_1[key] = list(set(entity_1[key]))
-        else:
-            entity_1[key] = value
+        elif key not in entity_1:
+            # only when entity 1 lacks the property; an empty entity 2 value must not overwrite entity 1
+            entity_1[key] = entity_2_value
     return entity_1
 
 

diff --git a/Common/metadata.py b/Common/metadata.py
@@ -3,7 +3,7 @@
 import json
 from xxhash import xxh64_hexdigest
 
-from Common.kgxmodel import NormalizationScheme
+from Common.normalization import NormalizationScheme
 
 
 class Metadata:
@@ -122,6 +122,9 @@ def get_build_status(self):
     def get_graph_version(self):
         return self.metadata['graph_version']
 
+    def get_source_ids(self):
+        return [source['source_id'] for source in self.metadata['sources']]
+
 
 class SourceMetadata(Metadata):