WIP: Added es and cerebro services, rudimentary unstract-metrics lib added #473

Open · wants to merge 5 commits into `main`
27 changes: 26 additions & 1 deletion docker/docker-compose-dev-essentials.yaml
@@ -30,6 +30,31 @@ services:
labels:
- traefik.enable=false

es:
image: "docker.elastic.co/elasticsearch/elasticsearch:8.14.1"
container_name: unstract-es
restart: unless-stopped
ports:
- "9200:9200"
environment:
node.name: es
discovery.seed_hosts: es
cluster.initial_master_nodes: es
cluster.name: unstract-metrics
bootstrap.memory_lock: "true"
xpack.security.enabled: "false"
ES_JAVA_OPTS: -Xms256m -Xmx256m
volumes:
- es_data:/usr/share/elasticsearch/data
ulimits:
memlock:
soft: -1
hard: -1
labels:
- traefik.enable=false
profiles:
- unstract-metrics

minio:
image: 'minio/minio:latest'
container_name: unstract-minio
@@ -123,8 +148,8 @@ services:
- ./essentials.env

volumes:
flipt_data:
minio_data:
postgres_data:
qdrant_data:
redis_data:
es_data:
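Not part of this PR, but a compose healthcheck along these lines (a sketch; the timings and the assumption that the ES 8.x image ships `curl` are mine) would let dependents such as `cerebro` wait for Elasticsearch to be ready rather than merely started:

```yaml
es:
  healthcheck:
    # Poll cluster health until the node responds.
    test: ["CMD-SHELL", "curl -sf http://localhost:9200/_cluster/health || exit 1"]
    interval: 10s
    timeout: 5s
    retries: 12
```

With this in place, `cerebro`'s `depends_on` entry could use `condition: service_healthy`.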
61 changes: 38 additions & 23 deletions docker/docker-compose.yaml
@@ -51,29 +51,6 @@ services:
volumes:
- ./workflow_data:/data

# Celery Flower
celery-flower:
image: unstract/backend:${VERSION}
container_name: unstract-celery-flower
restart: unless-stopped
entrypoint: .venv/bin/celery
command: "-A backend flower --port=5555 --purge_offline_workers=5"
env_file:
- ../backend/.env
depends_on:
- execution-consumer
- redis
labels:
- traefik.enable=false
ports:
- "5555:5555"
environment:
- ENVIRONMENT=development
volumes:
- unstract_data:/data
profiles:
- optional

# Celery Beat
celery-beat:
image: unstract/backend:${VERSION}
@@ -164,6 +141,44 @@ services:
labels:
- traefik.enable=false

# Web Admin tools to monitor services
# Celery Flower
celery-flower:
image: unstract/backend:${VERSION}
container_name: unstract-celery-flower
restart: unless-stopped
entrypoint: .venv/bin/celery
command: "-A backend flower --port=5555 --purge_offline_workers=5"
env_file:
- ../backend/.env
depends_on:
- execution-consumer
- redis
labels:
- traefik.enable=false
ports:
- "5555:5555"
environment:
- ENVIRONMENT=development
volumes:
- unstract_data:/data
profiles:
- optional

# Cerebro
cerebro:
image: lmenezes/cerebro:0.9.4
container_name: unstract-cerebro
restart: unless-stopped
ports:
- 9201:9000
depends_on:
- es
labels:
- "traefik.enable=false"
profiles:
- unstract-metrics

document-service:
profiles:
- optional
22 changes: 22 additions & 0 deletions unstract/metrics/README.md
@@ -0,0 +1,22 @@
# Unstract Metrics Aggregator

Collects metrics from Unstract and its adapters and pushes them to Elasticsearch.

Run the services below with the compose profile `unstract-metrics`:
- elasticsearch
- cerebro (a UI for managing the ES instance)

```shell
VERSION=<version> docker compose -f docker-compose.yaml --profile unstract-metrics up -d
```


## Using Cerebro: An Elasticsearch Web Admin tool
Review thread on this section:

- **Contributor:** @chandrasekharan-zipstack Create a Confluence page for Cerebro; no need to include it here.
- **Contributor:** @jaseemjaskp Wondering if we should mention it in the README so that users can install it for admin tasks if they wish? Are there any other recommended tools?
- **Author:** @hari-kuriakose Kibana, maybe?
- **Contributor:** @hari-kuriakose @chandrasekharan-zipstack Elasticsearch's official one is Kibana. We can mention that in the README if required. https://github.com/lmenezes/cerebro is not up to date with official ES.
- **Author:** @jaseemjaskp If Cerebro is not up to date with official ES, do you think we should also try Kibana?
- **Contributor:** @chandrasekharan-zipstack Try out Kibana, but no need to include it in the docker-compose file.


- Run Cerebro with

```shell
VERSION=<version> docker compose -f docker-compose.yaml --profile unstract-metrics up -d cerebro
```

- Connect to `http://localhost:9201/` and enter `http://es:9200/` as the node address
507 changes: 507 additions & 0 deletions unstract/metrics/pdm.lock

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions unstract/metrics/pyproject.toml
@@ -0,0 +1,38 @@
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"

[project]
name = "unstract-metrics"
version = "0.0.1"
description = "Helps with collection of metrics from Unstract's adapters"
authors = [{ name = "Zipstack Inc.", email = "[email protected]" }]
dependencies = ["elasticsearch-dsl~=8.14.0"]
# <3.11.1 due to resolution error from Unstract SDK
requires-python = ">=3.9,<3.11.1"
readme = "README.md"
classifiers = ["Programming Language :: Python"]

[tool.pdm.dev-dependencies]
test = [
"pytest>=8.2.2",
"pytest-mock>=3.14.0",
"pytest-dotenv>=0.5.2",
"pytest-cov>=5.0.0",
"pytest-md-report>=0.6.2",
]

[tool.pdm.build]
includes = ["src"]
package-dir = "src"

[tool.pytest.ini_options]
env_files = ["tests/.env"]
addopts = "-s"
log_level = "INFO"
log_cli = true

[tool.pdm.scripts]
test.cmd = "pytest -s -v"
test.env_file = "tests/.env"
test.help = "Runs pytests for Unstract Metrics"
16 changes: 16 additions & 0 deletions unstract/metrics/src/unstract/metrics/__init__.py
@@ -0,0 +1,16 @@
import os

from elasticsearch_dsl import connections

from .metrics import MetricsAggregator # noqa: F401

ES_URL = os.getenv("ES_URL")
ES_CLOUD_ID = os.getenv("ES_CLOUD_ID")
ES_API_KEY = os.getenv("ES_API_KEY")
if not (ES_URL or (ES_CLOUD_ID and ES_API_KEY)):
    raise ValueError(
        "Either env ES_URL or both ES_CLOUD_ID and ES_API_KEY "
        "are required to import unstract-metrics"
    )

connections.create_connection(hosts=[ES_URL], cloud_id=ES_CLOUD_ID, api_key=ES_API_KEY)
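The env validation above accepts either `ES_URL` alone or both `ES_CLOUD_ID` and `ES_API_KEY` together; as a pure function (a hypothetical helper, not part of the PR), the accepted combinations are:

```python
from typing import Optional


def has_valid_es_config(
    es_url: Optional[str],
    es_cloud_id: Optional[str],
    es_api_key: Optional[str],
) -> bool:
    """True if either a node URL, or a cloud id plus API key, is configured."""
    return bool(es_url or (es_cloud_id and es_api_key))
```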
20 changes: 20 additions & 0 deletions unstract/metrics/src/unstract/metrics/metrics.py
@@ -0,0 +1,20 @@
from typing import Optional

from unstract.metrics.models.metrics import Metrics


class MetricsAggregator:

def __init__(self, index_to_clone: Optional[str] = None) -> None:
# TODO: Create index with dynamic templates through a separate command
if not Metrics._index.exists():
Metrics.init(index=index_to_clone)

def add_metrics(self, metrics, index: str = "unstract-metrics-0"):
metrics_doc = Metrics(**metrics)
metrics_doc.save(index=index)

def query_metrics(self, run_id: str, index: str = "unstract-metrics-0"):
s = Metrics.search(index=index).query("match", run_id=run_id)
response = s.execute()
return response.to_dict()
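Since `add_metrics` accepts a plain dict, a hedged example of what a caller might pass — field names come from the `Metrics` document in this PR, while the concrete values and the shape of each operation (a `sub_process` keyword, inferred from the dynamic templates) are assumptions:

```python
# Hypothetical payload for MetricsAggregator.add_metrics(); values are
# illustrative, and the operation shape is inferred from the dynamic templates.
sample_metrics = {
    "org_id": "org-1",
    "run_id": "run-42",
    "start_time": "2024-07-01T10:00:00Z",
    "end_time": "2024-07-01T10:00:05Z",
    "agent": "WF",
    "status": "SUCCESS",
    "operations": [{"sub_process": "LLM"}],
}

# Fields marked required=True on the Metrics document.
REQUIRED_FIELDS = {"org_id", "run_id", "start_time", "end_time"}
missing = REQUIRED_FIELDS - sample_metrics.keys()
assert not missing, f"missing required fields: {missing}"
```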
7 changes: 7 additions & 0 deletions unstract/metrics/src/unstract/metrics/models/log.py
@@ -0,0 +1,7 @@
from elasticsearch_dsl import Date, InnerDoc, Keyword, Text


class Log(InnerDoc):
level = Keyword()
time = Date()
message = Text()
162 changes: 162 additions & 0 deletions unstract/metrics/src/unstract/metrics/models/metrics.py
@@ -0,0 +1,162 @@
from elasticsearch_dsl import (
Date,
Document,
Float,
InnerDoc,
Integer,
Keyword,
Nested,
Object,
Text,
)

from .operation import Operation


class LLMOperation(InnerDoc):
prompt = Text()
generated_response = Text()
adapter_metadata = Object(
properties={
"adapter_instance_id": Keyword(),
"type": Keyword(),
"name": Text(),
"model": Text(),
"max_retries": Integer(doc_values=False),
"max_output_tokens": Integer(doc_values=False),
}
)
metrics = Object(
properties={
"input_tokens": Integer(),
"output_tokens": Integer(),
"latency": Float(),
"input_tokens_cost": Float(),
"output_tokens_cost": Float(),
"total_cost": Float(),
}
)


class VectorDBOperation(InnerDoc):
doc_id = Keyword()
retrieved_docs = Keyword(multi=True)
adapter_metadata = Object(
properties={
"adapter_instance_id": Keyword(),
"type": Keyword(),
"name": Text(),
"dimension": Integer(doc_values=False),
}
)
metrics = Object(
properties={"operation": Keyword(), "count": Integer(), "latency": Float()}
)


class EmbeddingOperation(InnerDoc):
adapter_metadata = Object(
properties={
"adapter_instance_id": Keyword(),
"type": Keyword(),
"name": Text(),
"model": Text(),
"embed_batch_size": Integer(),
}
)
metrics = Object(
properties={"tokens": Integer(), "latency": Float(), "cost": Float()}
)


class X2TextOperation(InnerDoc):
adapter_metadata = Object(
properties={
"adapter_instance_id": Keyword(),
"type": Keyword(),
"name": Text(),
"mode": Text(),
}
)
metrics = Object(
properties={
"pages_extracted": Integer(),
"latency": Float(),
}
)


class Metrics(Document):
org_id = Keyword(required=True)
run_id = Keyword(required=True)
start_time = Date(required=True)
end_time = Date(required=True)
owner = Keyword()
agent = Keyword() # TODO: Enum - WF | API | PS
agent_name = Text()
agent_id = Keyword()
status = Keyword() # TODO: Make enum
api_key = Text()
operations = Nested(Operation)

class Index:
name = "unstract-metrics-*"
settings = {"number_of_replicas": 0, "number_of_shards": 1}

def save(
self,
using=None,
index=None,
validate=True,
skip_empty=True,
return_doc_meta=False,
**kwargs,
):
self.meta.id = self.run_id
return super().save(
using, index, validate, skip_empty, return_doc_meta, **kwargs
)

@classmethod
def create_index(cls):
cls.init()
# Add dynamic templates for sub_process specific mappings
cls._index.put_mapping(
body={
"dynamic_templates": [
{
"llm_template": {
"path_match": "operations.sub_process",
"match_mapping_type": "string",
"mapping": {
"type": "nested",
"properties": LLMOperation._doc_type.mapping.properties.to_dict(), # noqa: E501
},
"match": "LLM",
}
},
{
"vectordb_template": {
"path_match": "operations.sub_process",
"match_mapping_type": "string",
"mapping": {
"type": "nested",
"properties": VectorDBOperation._doc_type.mapping.properties.to_dict(), # noqa: E501
},
"match": "VECTORDB",
}
},
{
"embedding_template": {
"path_match": "operations.sub_process",
"match_mapping_type": "string",
"mapping": {
"type": "nested",
"properties": EmbeddingOperation._doc_type.mapping.properties.to_dict(), # noqa: E501
},
"match": "EMBEDDING",
}
},
]
}
)
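The three dynamic templates differ only in their `match` value, so the mapping a given `sub_process` string would receive can be sketched in plain Python (a deliberate simplification of Elasticsearch's matching rules; template names are taken from `create_index` above):

```python
from typing import Optional

# match value -> dynamic template name, as declared in create_index().
SUB_PROCESS_TEMPLATES = {
    "LLM": "llm_template",
    "VECTORDB": "vectordb_template",
    "EMBEDDING": "embedding_template",
}


def template_for(sub_process: str) -> Optional[str]:
    """Return the dynamic template that would claim this sub_process, if any."""
    return SUB_PROCESS_TEMPLATES.get(sub_process)
```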