Nolan/collections migration (#1681)

* 20,539% speed up in collections overview * Migration
2024-12-10 13:57:10 -08:00
parent a702d3ae85
commit 245b6221dd
5 changed files with 93 additions and 14 deletions
@@ -297,8 +297,9 @@ services:
      - R2R_POSTGRES_HOST=${R2R_POSTGRES_HOST:-postgres}
      - R2R_POSTGRES_PORT=${R2R_POSTGRES_PORT:-5432}
      - R2R_POSTGRES_DBNAME=${R2R_POSTGRES_DBNAME:-postgres}
-      - R2R_POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
      - R2R_POSTGRES_PROJECT_NAME=${R2R_POSTGRES_PROJECT_NAME:-r2r_default}
+      - R2R_POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
+      - R2R_POSTGRES_STATEMENT_CACHE_SIZE=${R2R_POSTGRES_STATEMENT_CACHE_SIZE:-100}

      # OpenAI
      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
@@ -63,8 +63,9 @@ services:
      - R2R_POSTGRES_HOST=${R2R_POSTGRES_HOST:-postgres}
      - R2R_POSTGRES_PORT=${R2R_POSTGRES_PORT:-5432}
      - R2R_POSTGRES_DBNAME=${R2R_POSTGRES_DBNAME:-postgres}
-      - R2R_POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
      - R2R_POSTGRES_PROJECT_NAME=${R2R_POSTGRES_PROJECT_NAME:-r2r_default}
+      - R2R_POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
+      - R2R_POSTGRES_STATEMENT_CACHE_SIZE=${R2R_POSTGRES_STATEMENT_CACHE_SIZE:-100}

      # OpenAI
      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
@@ -1,4 +1,3 @@
-import time
 import json
 import logging
 from typing import Any, Optional
@@ -292,7 +291,6 @@ class PostgresCollectionHandler(CollectionsHandler):
        filter_document_ids: Optional[list[UUID]] = None,
        filter_collection_ids: Optional[list[UUID]] = None,
    ) -> dict[str, list[CollectionResponse] | int]:
-        t0 = time.time()
        conditions = []
        params: list[Any] = []
        param_index = 1
@@ -349,24 +347,15 @@ class PostgresCollectionHandler(CollectionsHandler):
            params.append(limit)

        try:
-            t2 = time.time()
            results = await self.connection_manager.fetch_query(query, params)
-            t3 = time.time()
-            print(f"Time in fetch_query: {t3 - t2}")

            if not results:
                return {"results": [], "total_entries": 0}

            total_entries = results[0]["total_entries"] if results else 0

-            t4 = time.time()
            collections = [CollectionResponse(**row) for row in results]
-            t5 = time.time()

-            print(f"Time in creating CollectionResponse objects: {t5 - t4}")
-
-            t1 = time.time()
-            print(f"Total time in get_collections_overview: {t1 - t0}")
            return {"results": collections, "total_entries": total_entries}
        except Exception as e:
            raise HTTPException(
@@ -428,6 +417,15 @@ class PostgresCollectionHandler(CollectionsHandler):
                    message="Document is already assigned to the collection",
                )

+            update_collection_query = f"""
+                UPDATE {self._get_table_name('collections')}
+                SET document_count = document_count + 1
+                WHERE id = $1
+            """
+            await self.connection_manager.execute_query(
+                query=update_collection_query, params=[collection_id]
+            )
+
            return collection_id

        except R2RException:
@@ -387,11 +387,22 @@ class PostgresUserHandler(UserHandler):
        """
        result = await self.connection_manager.fetchrow_query(
            query, [collection_id, id]
-        )  # fetchrow instead of execute_query
+        )
        if not result:
            raise R2RException(
                status_code=400, message="User already in collection"
            )
+
+        update_collection_query = f"""
+            UPDATE {self._get_table_name('collections')}
+            SET user_count = user_count + 1
+            WHERE id = $1
+        """
+        await self.connection_manager.execute_query(
+            query=update_collection_query,
+            params=[collection_id],
+        )
+
        return True

    async def remove_user_from_collection(
@@ -0,0 +1,68 @@
+"""Add user and document count to collection
+
+Revision ID: c45a9cf6a8a4
+Revises: 
+Create Date: 2024-12-10 13:28:07.798167
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+import os
+
+# revision identifiers, used by Alembic.
+revision: str = "c45a9cf6a8a4"
+down_revision: Union[str, None] = None
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+project_name = os.getenv("R2R_PROJECT_NAME")
+if not project_name:
+    raise ValueError(
+        "Environment variable `R2R_PROJECT_NAME` must be provided migrate, it should be set equal to the value of `project_name` in your `r2r.toml`."
+    )
+
+
+def upgrade():
+    # Add the new columns with default value of 0
+    op.add_column(
+        "collections",
+        sa.Column(
+            "user_count", sa.Integer(), nullable=False, server_default="0"
+        ),
+        schema=project_name,
+    )
+    op.add_column(
+        "collections",
+        sa.Column(
+            "document_count", sa.Integer(), nullable=False, server_default="0"
+        ),
+        schema=project_name,
+    )
+
+    # Initialize the counts based on existing relationships
+    op.execute(
+        f"""
+        WITH collection_counts AS (
+            SELECT c.id,
+                   COUNT(DISTINCT u.id) as user_count,
+                   COUNT(DISTINCT d.id) as document_count
+            FROM {project_name}.collections c
+            LEFT JOIN {project_name}.users u ON c.id = ANY(u.collection_ids)
+            LEFT JOIN {project_name}.documents d ON c.id = ANY(d.collection_ids)
+            GROUP BY c.id
+        )
+        UPDATE {project_name}.collections c
+        SET user_count = COALESCE(cc.user_count, 0),
+            document_count = COALESCE(cc.document_count, 0)
+        FROM collection_counts cc
+        WHERE c.id = cc.id
+    """
+    )
+
+
+def downgrade():
+    op.drop_column("collections", "document_count", schema=project_name)
+    op.drop_column("collections", "user_count", schema=project_name)