Nolan/collections migration (#1681)

* 20,539% speed up in collections overview

* Migration
This commit is contained in:
Nolan Tremelling
2024-12-10 13:57:10 -08:00
committed by GitHub
parent a702d3ae85
commit 245b6221dd
5 changed files with 93 additions and 14 deletions
+2 -1
View File
@@ -297,8 +297,9 @@ services:
- R2R_POSTGRES_HOST=${R2R_POSTGRES_HOST:-postgres}
- R2R_POSTGRES_PORT=${R2R_POSTGRES_PORT:-5432}
- R2R_POSTGRES_DBNAME=${R2R_POSTGRES_DBNAME:-postgres}
- R2R_POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
- R2R_POSTGRES_PROJECT_NAME=${R2R_POSTGRES_PROJECT_NAME:-r2r_default}
- R2R_POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
- R2R_POSTGRES_STATEMENT_CACHE_SIZE=${R2R_POSTGRES_STATEMENT_CACHE_SIZE:-100}
# OpenAI
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
+2 -1
View File
@@ -63,8 +63,9 @@ services:
- R2R_POSTGRES_HOST=${R2R_POSTGRES_HOST:-postgres}
- R2R_POSTGRES_PORT=${R2R_POSTGRES_PORT:-5432}
- R2R_POSTGRES_DBNAME=${R2R_POSTGRES_DBNAME:-postgres}
- R2R_POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
- R2R_POSTGRES_PROJECT_NAME=${R2R_POSTGRES_PROJECT_NAME:-r2r_default}
- R2R_POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
- R2R_POSTGRES_STATEMENT_CACHE_SIZE=${R2R_POSTGRES_STATEMENT_CACHE_SIZE:-100}
# OpenAI
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
+9 -11
View File
@@ -1,4 +1,3 @@
import time
import json
import logging
from typing import Any, Optional
@@ -292,7 +291,6 @@ class PostgresCollectionHandler(CollectionsHandler):
filter_document_ids: Optional[list[UUID]] = None,
filter_collection_ids: Optional[list[UUID]] = None,
) -> dict[str, list[CollectionResponse] | int]:
t0 = time.time()
conditions = []
params: list[Any] = []
param_index = 1
@@ -349,24 +347,15 @@ class PostgresCollectionHandler(CollectionsHandler):
params.append(limit)
try:
t2 = time.time()
results = await self.connection_manager.fetch_query(query, params)
t3 = time.time()
print(f"Time in fetch_query: {t3 - t2}")
if not results:
return {"results": [], "total_entries": 0}
total_entries = results[0]["total_entries"] if results else 0
t4 = time.time()
collections = [CollectionResponse(**row) for row in results]
t5 = time.time()
print(f"Time in creating CollectionResponse objects: {t5 - t4}")
t1 = time.time()
print(f"Total time in get_collections_overview: {t1 - t0}")
return {"results": collections, "total_entries": total_entries}
except Exception as e:
raise HTTPException(
@@ -428,6 +417,15 @@ class PostgresCollectionHandler(CollectionsHandler):
message="Document is already assigned to the collection",
)
update_collection_query = f"""
UPDATE {self._get_table_name('collections')}
SET document_count = document_count + 1
WHERE id = $1
"""
await self.connection_manager.execute_query(
query=update_collection_query, params=[collection_id]
)
return collection_id
except R2RException:
+12 -1
View File
@@ -387,11 +387,22 @@ class PostgresUserHandler(UserHandler):
"""
result = await self.connection_manager.fetchrow_query(
query, [collection_id, id]
) # fetchrow instead of execute_query
)
if not result:
raise R2RException(
status_code=400, message="User already in collection"
)
update_collection_query = f"""
UPDATE {self._get_table_name('collections')}
SET user_count = user_count + 1
WHERE id = $1
"""
await self.connection_manager.execute_query(
query=update_collection_query,
params=[collection_id],
)
return True
async def remove_user_from_collection(
@@ -0,0 +1,68 @@
"""Add user and document count to collection
Revision ID: c45a9cf6a8a4
Revises:
Create Date: 2024-12-10 13:28:07.798167
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
import os
# Revision identifiers, used by Alembic.
revision: str = "c45a9cf6a8a4"
# This revision declares no parent revision.
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# Schema that holds the `collections`, `users` and `documents` tables touched
# by this migration. It is resolved once at import time so upgrade() and
# downgrade() target the same schema, and a missing value aborts the run
# before any DDL is issued.
project_name = os.getenv("R2R_PROJECT_NAME")
if not project_name:
    raise ValueError(
        "Environment variable `R2R_PROJECT_NAME` must be provided to migrate, "
        "it should be set equal to the value of `project_name` in your "
        "`r2r.toml`."
    )
def upgrade():
    """Add `user_count` and `document_count` to `collections` and backfill them.

    Both columns are created as NOT NULL integers with a server-side default
    of 0, so existing rows are valid as soon as the columns exist. The counts
    are then initialised from the current memberships: a user or document
    belongs to a collection when the collection id appears in its
    `collection_ids` array.
    """
    # Add the new columns with a default value of 0.
    for column_name in ("user_count", "document_count"):
        op.add_column(
            "collections",
            sa.Column(
                column_name, sa.Integer(), nullable=False, server_default="0"
            ),
            schema=project_name,
        )

    # Initialise the counts from existing relationships.
    #
    # Each count is computed by its own correlated subquery. Joining users
    # and documents to a collection in a single query would materialise
    # (users x documents) rows per collection before de-duplicating, which
    # grows quadratically for large collections. COUNT() yields 0 when no
    # row matches, so collections without members are still set to 0.
    op.execute(
        f"""
        UPDATE {project_name}.collections c
        SET user_count = (
                SELECT COUNT(DISTINCT u.id)
                FROM {project_name}.users u
                WHERE c.id = ANY(u.collection_ids)
            ),
            document_count = (
                SELECT COUNT(DISTINCT d.id)
                FROM {project_name}.documents d
                WHERE c.id = ANY(d.collection_ids)
            )
        """
    )
def downgrade():
    """Drop the counter columns from `collections`.

    Removes `document_count` first and `user_count` second, from the table in
    the schema named by `project_name`.
    """
    for column_name in ("document_count", "user_count"):
        op.drop_column("collections", column_name, schema=project_name)