From 87a92babc6b6033018e58e7ff8356776b2d26747 Mon Sep 17 00:00:00 2001 From: Basit Mustafa Date: Fri, 20 Jun 2025 14:11:11 -0700 Subject: [PATCH] fix(web): batch findMany queries fixes #360 --- .../configuration/environment-variables.mdx | 1 + docs/snippets/schemas/v3/index.schema.mdx | 14 +++ packages/backend/src/constants.ts | 1 + packages/schemas/src/v3/index.schema.ts | 14 +++ packages/schemas/src/v3/index.type.ts | 4 + packages/web/src/actions.ts | 56 ++++++++---- packages/web/src/auth.ts | 2 +- packages/web/src/env.mjs | 3 + packages/web/src/features/search/searchApi.ts | 29 +++---- packages/web/src/lib/repoBatchQueries.ts | 87 +++++++++++++++++++ schemas/v3/index.json | 7 ++ 11 files changed, 182 insertions(+), 36 deletions(-) create mode 100644 packages/web/src/lib/repoBatchQueries.ts diff --git a/docs/docs/configuration/environment-variables.mdx b/docs/docs/configuration/environment-variables.mdx index 96f8e329..1799cb87 100644 --- a/docs/docs/configuration/environment-variables.mdx +++ b/docs/docs/configuration/environment-variables.mdx @@ -20,6 +20,7 @@ The following environment variables allow you to configure your Sourcebot deploy | `DATA_DIR` | `/data` |

The directory within the container to store all persistent data. Typically, this directory will be volume mapped such that data is persisted across container restarts (e.g., `docker run -v $(pwd):/data`)

| | `DATABASE_DATA_DIR` | `$DATA_CACHE_DIR/db` |

The data directory for the default Postgres database.

| | `DATABASE_URL` | `postgresql://postgres@ localhost:5432/sourcebot` |

Connection string of your Postgres database. By default, a Postgres database is automatically provisioned at startup within the container.

If you'd like to use a non-default schema, you can provide it as a parameter in the database url

| +| `DB_QUERY_BATCH_SIZE` | `500` |

The number of records fetched per database query. Large result sets are loaded in batches of this size to prevent memory issues; this is a workaround for Prisma issue [#13864](https://github.com/prisma/prisma/issues/13864). Can also be configured via the `dbQueryBatchSize` setting in the configuration file. Valid range: 100-10000.

| | `EMAIL_FROM_ADDRESS` | `-` |

The email address that transactional emails will be sent from. See [this doc](/docs/configuration/transactional-emails) for more info.

| | `REDIS_DATA_DIR` | `$DATA_CACHE_DIR/redis` |

The data directory for the default Redis instance.

| | `REDIS_URL` | `redis://localhost:6379` |

Connection string of your Redis instance. By default, a Redis database is automatically provisioned at startup within the container.

| diff --git a/docs/snippets/schemas/v3/index.schema.mdx b/docs/snippets/schemas/v3/index.schema.mdx index 79bcda80..35749b4f 100644 --- a/docs/snippets/schemas/v3/index.schema.mdx +++ b/docs/snippets/schemas/v3/index.schema.mdx @@ -68,6 +68,13 @@ "type": "boolean", "description": "[Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.", "default": false + }, + "dbQueryBatchSize": { + "type": "number", + "description": "The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. Defaults to 500.", + "minimum": 100, + "maximum": 10000, + "default": 500 } }, "additionalProperties": false @@ -182,6 +189,13 @@ "type": "boolean", "description": "[Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.", "default": false + }, + "dbQueryBatchSize": { + "type": "number", + "description": "The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. 
Defaults to 500.", + "minimum": 100, + "maximum": 10000, + "default": 500 } }, "additionalProperties": false diff --git a/packages/backend/src/constants.ts b/packages/backend/src/constants.ts index 19bbc978..2c76aa0e 100644 --- a/packages/backend/src/constants.ts +++ b/packages/backend/src/constants.ts @@ -16,4 +16,5 @@ export const DEFAULT_SETTINGS: Settings = { repoGarbageCollectionGracePeriodMs: 10 * 1000, // 10 seconds repoIndexTimeoutMs: 1000 * 60 * 60 * 2, // 2 hours enablePublicAccess: false, + dbQueryBatchSize: 500, // Default batch size for database queries } diff --git a/packages/schemas/src/v3/index.schema.ts b/packages/schemas/src/v3/index.schema.ts index 35e7a4fe..a47424d6 100644 --- a/packages/schemas/src/v3/index.schema.ts +++ b/packages/schemas/src/v3/index.schema.ts @@ -67,6 +67,13 @@ const schema = { "type": "boolean", "description": "[Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.", "default": false + }, + "dbQueryBatchSize": { + "type": "number", + "description": "The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. Defaults to 500.", + "minimum": 100, + "maximum": 10000, + "default": 500 } }, "additionalProperties": false @@ -181,6 +188,13 @@ const schema = { "type": "boolean", "description": "[Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.", "default": false + }, + "dbQueryBatchSize": { + "type": "number", + "description": "The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. 
Defaults to 500.", + "minimum": 100, + "maximum": 10000, + "default": 500 } }, "additionalProperties": false diff --git a/packages/schemas/src/v3/index.type.ts b/packages/schemas/src/v3/index.type.ts index d239245f..dece6b0d 100644 --- a/packages/schemas/src/v3/index.type.ts +++ b/packages/schemas/src/v3/index.type.ts @@ -83,6 +83,10 @@ export interface Settings { * [Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats. */ enablePublicAccess?: boolean; + /** + * The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. Defaults to 500. + */ + dbQueryBatchSize?: number; } /** * Search context diff --git a/packages/web/src/actions.ts b/packages/web/src/actions.ts index e9adbc2a..99599a55 100644 --- a/packages/web/src/actions.ts +++ b/packages/web/src/actions.ts @@ -662,28 +662,46 @@ export const getConnectionInfo = async (connectionId: number, domain: string) => export const getRepos = async (domain: string, filter: { status?: RepoIndexingStatus[], connectionId?: number } = {}) => sew(() => withAuth((userId) => withOrgMembership(userId, domain, async ({ org }) => { - const repos = await prisma.repo.findMany({ - where: { - orgId: org.id, - ...(filter.status ? { - repoIndexingStatus: { in: filter.status } - } : {}), - ...(filter.connectionId ? { - connections: { - some: { - connectionId: filter.connectionId - } - } - } : {}), - }, - include: { + // Use batched query to prevent memory issues with large datasets. + // The batch size is configurable via DB_QUERY_BATCH_SIZE environment variable. + const whereClause = { + orgId: org.id, + ...(filter.status ? { + repoIndexingStatus: { in: filter.status } + } : {}), + ...(filter.connectionId ? 
{ connections: { - include: { - connection: true, + some: { + connectionId: filter.connectionId } } - } - }); + } : {}), + }; + + // First get the total count + const totalCount = await prisma.repo.count({ where: whereClause }); + + const repos = []; + const batchSize = env.DB_QUERY_BATCH_SIZE; + const totalBatches = Math.ceil(totalCount / batchSize); + + // Execute queries in batches + for (let i = 0; i < totalBatches; i++) { + const skip = i * batchSize; + const batchResults = await prisma.repo.findMany({ + where: whereClause, + include: { + connections: { + include: { + connection: true, + } + } + }, + skip, + take: batchSize, + }); + repos.push(...batchResults); + } return repos.map((repo) => repositoryQuerySchema.parse({ codeHostType: repo.external_codeHostType, diff --git a/packages/web/src/auth.ts b/packages/web/src/auth.ts index a341192d..e1031a00 100644 --- a/packages/web/src/auth.ts +++ b/packages/web/src/auth.ts @@ -141,7 +141,7 @@ export const { handlers, signIn, signOut, auth } = NextAuth({ trustHost: true, events: { createUser: onCreateUser, - signIn: async ({ user, account }) => { + signIn: async ({ user, account: _account }) => { if (user.id) { await auditService.createAudit({ action: "user.signed_in", diff --git a/packages/web/src/env.mjs b/packages/web/src/env.mjs index 0a34c26f..dd4151f8 100644 --- a/packages/web/src/env.mjs +++ b/packages/web/src/env.mjs @@ -19,6 +19,9 @@ export const env = createEnv({ TOTAL_MAX_MATCH_COUNT: numberSchema.default(100000), ZOEKT_MAX_WALL_TIME_MS: numberSchema.default(10000), + // Database Query Performance + DB_QUERY_BATCH_SIZE: numberSchema.default(500), + // Auth AUTH_SECRET: z.string(), AUTH_URL: z.string().url(), diff --git a/packages/web/src/features/search/searchApi.ts b/packages/web/src/features/search/searchApi.ts index 8ca08640..bdecb005 100644 --- a/packages/web/src/features/search/searchApi.ts +++ b/packages/web/src/features/search/searchApi.ts @@ -12,6 +12,7 @@ import * as Sentry from "@sentry/nextjs"; 
import { sew, withAuth, withOrgMembership } from "@/actions"; import { base64Decode } from "@sourcebot/shared"; import { getAuditService } from "@/ee/features/audit/factory"; +import { batchedFindReposByIds, batchedFindReposByNames } from "@/lib/repoBatchQueries"; const auditService = getAuditService(); @@ -198,23 +199,19 @@ export const search = async ({ query, matches, contextLines, whole }: SearchRequ const repoIdentifiers = new Set(Result.Files?.map((file) => file.RepositoryID ?? file.Repository) ?? []); const repos = new Map(); - (await prisma.repo.findMany({ - where: { - id: { - in: Array.from(repoIdentifiers).filter((id) => typeof id === "number"), - }, - orgId: org.id, - } - })).forEach(repo => repos.set(repo.id, repo)); + // Batch query repos by ID to prevent memory issues with large datasets + const numericIds = Array.from(repoIdentifiers).filter((id) => typeof id === "number") as number[]; + if (numericIds.length > 0) { + const reposByIds = await batchedFindReposByIds(numericIds, org.id); + reposByIds.forEach((repo) => repos.set(repo.id, repo)); + } - (await prisma.repo.findMany({ - where: { - name: { - in: Array.from(repoIdentifiers).filter((id) => typeof id === "string"), - }, - orgId: org.id, - } - })).forEach(repo => repos.set(repo.name, repo)); + // Batch query repos by name to prevent memory issues with large datasets + const stringNames = Array.from(repoIdentifiers).filter((id) => typeof id === "string") as string[]; + if (stringNames.length > 0) { + const reposByNames = await batchedFindReposByNames(stringNames, org.id); + reposByNames.forEach((repo) => repos.set(repo.name, repo)); + } const files = Result.Files?.map((file) => { const fileNameChunks = file.ChunkMatches.filter((chunk) => chunk.FileName); diff --git a/packages/web/src/lib/repoBatchQueries.ts b/packages/web/src/lib/repoBatchQueries.ts new file mode 100644 index 00000000..5cf08347 --- /dev/null +++ b/packages/web/src/lib/repoBatchQueries.ts @@ -0,0 +1,87 @@ +/** + * Utility functions 
for batched Repo queries to handle large datasets efficiently + * and prevent memory issues like "Failed to convert rust String into napi string" + * + * This is a workaround for the Prisma issue: https://github.com/prisma/prisma/issues/13864 + * + * The batch size can be configured via the DB_QUERY_BATCH_SIZE environment variable + * or the dbQueryBatchSize setting in the configuration file. + */ + +import { Repo } from "@sourcebot/db"; +import { prisma } from "@/prisma"; +import { env } from "@/env.mjs"; + +const DEFAULT_BATCH_SIZE = env.DB_QUERY_BATCH_SIZE; + +/** + * Fetches repos by IDs in batches to prevent memory issues + * @param ids - Array of repo IDs to fetch + * @param orgId - Organization ID to filter by + * @param batchSize - Size of each batch (default: 500) + * @returns Array of repos + */ +export async function batchedFindReposByIds( + ids: number[], + orgId: number, + batchSize: number = DEFAULT_BATCH_SIZE +): Promise { + if (ids.length === 0) { + return []; + } + + const results: Repo[] = []; + const totalBatches = Math.ceil(ids.length / batchSize); + + for (let i = 0; i < totalBatches; i++) { + const startIndex = i * batchSize; + const endIndex = Math.min(startIndex + batchSize, ids.length); + const batchIds = ids.slice(startIndex, endIndex); + + const batchResults = await prisma.repo.findMany({ + where: { + id: { in: batchIds }, + orgId, + } + }); + results.push(...batchResults); + } + + return results; +} + +/** + * Fetches repos by names in batches to prevent memory issues + * @param names - Array of repo names to fetch + * @param orgId - Organization ID to filter by + * @param batchSize - Size of each batch (default: 500) + * @returns Array of repos + */ +export async function batchedFindReposByNames( + names: string[], + orgId: number, + batchSize: number = DEFAULT_BATCH_SIZE +): Promise { + if (names.length === 0) { + return []; + } + + const results: Repo[] = []; + const totalBatches = Math.ceil(names.length / batchSize); + + for (let i = 
0; i < totalBatches; i++) { + const startIndex = i * batchSize; + const endIndex = Math.min(startIndex + batchSize, names.length); + const batchNames = names.slice(startIndex, endIndex); + + const batchResults = await prisma.repo.findMany({ + where: { + name: { in: batchNames }, + orgId, + } + }); + results.push(...batchResults); + } + + return results; +} \ No newline at end of file diff --git a/schemas/v3/index.json b/schemas/v3/index.json index 655a4466..d1799a61 100644 --- a/schemas/v3/index.json +++ b/schemas/v3/index.json @@ -66,6 +66,13 @@ "type": "boolean", "description": "[Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.", "default": false + }, + "dbQueryBatchSize": { + "type": "number", + "description": "The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. Defaults to 500.", + "minimum": 100, + "maximum": 10000, + "default": 500 } }, "additionalProperties": false