From 6816ee70bee1db9ed33d4a8e7cff9edb1107a7e0 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Mon, 16 Dec 2024 11:07:45 -0500 Subject: [PATCH 01/19] check for change in chunkAlgoHash --- .../src/contentStore/updateEmbeddedContent.ts | 47 +++++++++++++------ 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index 62c288baf..f48713c42 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -18,6 +18,21 @@ export interface EmbedConcurrencyOptions { createChunks?: number; } +const chunkAlgoHashes = new Map(); + +const getHashForFunc = (f: ChunkFunc, o?: Partial): string => { + const data = JSON.stringify(o ?? {}) + f.toString(); + const existingHash = chunkAlgoHashes.get(data); + if (existingHash) { + return existingHash; + } + const hash = createHash("sha256"); + hash.update(data); + const digest = hash.digest("hex"); + chunkAlgoHashes.set(data, digest); + return digest; +}; + /** (Re-)embeddedContent the pages in the page store that have changed since the given date and stores the embeddedContent in the embeddedContent store. @@ -48,6 +63,23 @@ export const updateEmbeddedContent = async ({ sourceNames ? ` in sources: ${sourceNames.join(", ")}` : "" }` ); + // find changed chunkingHashes + const allPages = await pageStore.loadPages({sources: sourceNames}); + for (const page of allPages) { + const existingContent = await embeddedContentStore.loadEmbeddedContent({ + page, + }); + const chunkAlgoHash = getHashForFunc(chunkPage, chunkOptions); + if (existingContent[0].chunkAlgoHash !== chunkAlgoHash) { + logger.info( + `Chunking algorithm has changed for ${page.sourceName}: ${page.url}. Deleting existing embedded content to force an update.` + ); + await embeddedContentStore.deleteEmbeddedContent({ + page, + }); + } + } + await PromisePool.withConcurrency(concurrencyOptions?.processPages ?? 1) .for(changedPages) .process(async (page, index, pool) => { @@ -73,21 +105,6 @@ export const updateEmbeddedContent = async ({ }); }; -const chunkAlgoHashes = new Map(); - -const getHashForFunc = (f: ChunkFunc, o?: Partial): string => { - const data = JSON.stringify(o ?? {}) + f.toString(); - const existingHash = chunkAlgoHashes.get(data); - if (existingHash) { - return existingHash; - } - const hash = createHash("sha256"); - hash.update(data); - const digest = hash.digest("hex"); - chunkAlgoHashes.set(data, digest); - return digest; -}; - export const updateEmbeddedContentForPage = async ({ page, store, From d688f5c815210b722417c2a23dac51c098485c67 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Mon, 16 Dec 2024 13:09:48 -0500 Subject: [PATCH 02/19] add pages with changed chunking to arg for next step --- .../src/contentStore/updateEmbeddedContent.ts | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index f48713c42..9017f9aec 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -63,22 +63,17 @@ export const updateEmbeddedContent = async ({ sourceNames ? ` in sources: ${sourceNames.join(", ")}` : "" }` ); - // find changed chunkingHashes + // find pages with changed chunkingHashes const allPages = await pageStore.loadPages({sources: sourceNames}); - for (const page of allPages) { + const pagesWithChangedChunking = allPages.filter(async (page) => { const existingContent = await embeddedContentStore.loadEmbeddedContent({ page, }); const chunkAlgoHash = getHashForFunc(chunkPage, chunkOptions); - if (existingContent[0].chunkAlgoHash !== chunkAlgoHash) { - logger.info( - `Chunking algorithm has changed for ${page.sourceName}: ${page.url}. Deleting existing embedded content to force an update.` - ); - await embeddedContentStore.deleteEmbeddedContent({ - page, - }); - } - } + return existingContent[0].chunkAlgoHash !== chunkAlgoHash + }); + + changedPages.push(...pagesWithChangedChunking); await PromisePool.withConcurrency(concurrencyOptions?.processPages ?? 1) .for(changedPages) From 47c3b922d3835c54c785b9b1863a9c0ebbac5082 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Tue, 17 Dec 2024 17:25:08 -0500 Subject: [PATCH 03/19] pulling pages to be updated based on embedded content --- .../src/contentStore/EmbeddedContent.ts | 12 +++- .../MongoDbEmbeddedContentStore.ts | 68 ++++++++++++++++++- .../updateEmbeddedContent.test.ts | 3 + .../src/contentStore/updateEmbeddedContent.ts | 39 +++++++---- 4 files changed, 105 insertions(+), 17 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts index 75ff32660..1c5c572eb 100644 --- a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts @@ -1,4 +1,4 @@ -import { Page } from "."; +import { Page, PersistedPage } from "."; import { VectorStore } from "../VectorStore"; /** @@ -84,6 +84,16 @@ export type EmbeddedContentStore = VectorStore & { */ loadEmbeddedContent(args: { page: Page }): Promise; + /** + Get the Pages of the embedded content that meets the filter requirements. + */ + getPagesFromEmbeddedContent(args: { + dataSources?: string[]; + updated: Date; + chunkAlgoHash: string; + inverseChunkAlgoHash?: boolean; + }): Promise; + /** Delete all embedded content for the given page and/or data sources. */ diff --git a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts index bcd3a1470..fa6c7c028 100644 --- a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts +++ b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts @@ -1,4 +1,4 @@ -import { pageIdentity } from "."; +import { pageIdentity, PersistedPage } from "."; import { DatabaseConnection } from "../DatabaseConnection"; import { EmbeddedContent, EmbeddedContentStore } from "./EmbeddedContent"; import { FindNearestNeighborsOptions, WithScore } from "../VectorStore"; @@ -95,6 +95,72 @@ export function makeMongoDbEmbeddedContentStore({ return await embeddedContentCollection.find(pageIdentity(page)).toArray(); }, + async getPagesFromEmbeddedContent({ + dataSources, + updated, + chunkAlgoHash, + inverseChunkAlgoHash = false, + }): Promise { + const pipeline = [ + { + $match: { + ...(dataSources ? { sourceName: { $in: dataSources } } : undefined), + updated: { + $gte: updated, + }, + chunkAlgoHash: inverseChunkAlgoHash + ? { $ne: chunkAlgoHash } + : chunkAlgoHash, + }, + }, + { + $lookup: { + from: "pages", + let: { + url: "$url", + sourceName: "$sourceName", + }, + pipeline: [ + { + $match: { + $expr: { + $and: [ + { + $eq: ["$url", "$$url"], + }, + { + $eq: ["$sourceName", "$$sourceName"], + }, + ], + }, + }, + }, + ], + as: "pages", + }, + }, + { + $project: { + _id: 0, + pages: 1, + }, + }, + { + $unwind: { + path: "$pages", + }, + }, + { + $replaceRoot: { + newRoot: "$pages", + }, + }, + ]; + return await embeddedContentCollection + .aggregate(pipeline) + .toArray(); + }, + async deleteEmbeddedContent({ page, dataSources, diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index cc5252378..c75233147 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -26,6 +26,9 @@ export const makeMockEmbeddedContentStore = (): EmbeddedContentStore => { async updateEmbeddedContent({ embeddedContent, page }) { content.set(page.url, [...embeddedContent]); }, + async getPagesFromEmbeddedContent(args) { + return []; + }, metadata: { embeddingName: "test", }, diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index 9017f9aec..cb300004f 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -34,8 +34,8 @@ const getHashForFunc = (f: ChunkFunc, o?: Partial): string => { }; /** - (Re-)embeddedContent the pages in the page store that have changed since the given date - and stores the embeddedContent in the embeddedContent store. + (Re-)embeddedContent the pages in the page store that have changed (copy change, or chunking algothrim change) + since the given date and stores the embeddedContent in the embeddedContent store. */ export const updateEmbeddedContent = async ({ since, @@ -54,27 +54,36 @@ export const updateEmbeddedContent = async ({ sourceNames?: string[]; concurrencyOptions?: EmbedConcurrencyOptions; }): Promise => { - const changedPages = await pageStore.loadPages({ + // find all pages with changed content + const pagesWithChangedContent = await pageStore.loadPages({ updated: since, sources: sourceNames, }); logger.info( - `Found ${changedPages.length} changed pages since ${since}${ + `Found ${pagesWithChangedContent.length} changed pages since ${since}${ sourceNames ? ` in sources: ${sourceNames.join(", ")}` : "" }` ); - // find pages with changed chunkingHashes - const allPages = await pageStore.loadPages({sources: sourceNames}); - const pagesWithChangedChunking = allPages.filter(async (page) => { - const existingContent = await embeddedContentStore.loadEmbeddedContent({ - page, + // find all pages with embeddings created using chunkingHashes different from the current chunkingHash + const chunkAlgoHash = getHashForFunc(chunkPage, chunkOptions); + const pagesWithChangedChunking = + await embeddedContentStore.getPagesFromEmbeddedContent({ + dataSources: sourceNames, + updated: since, + chunkAlgoHash, + inverseChunkAlgoHash: true, }); - const chunkAlgoHash = getHashForFunc(chunkPage, chunkOptions); - return existingContent[0].chunkAlgoHash !== chunkAlgoHash - }); - - changedPages.push(...pagesWithChangedChunking); - + logger.info( + `Found ${ + pagesWithChangedChunking.length + } pages with changed chunkingHashes since ${since}${ + sourceNames ? ` in sources: ${sourceNames.join(", ")}` : "" + }` + ); + const changedPages = [ + ...pagesWithChangedContent, + ...pagesWithChangedChunking, + ]; await PromisePool.withConcurrency(concurrencyOptions?.processPages ?? 1) .for(changedPages) .process(async (page, index, pool) => { From 11bac3a50368bb95e194a401ff51540d38ad1432 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Fri, 20 Dec 2024 15:19:08 -0500 Subject: [PATCH 04/19] update embeddings if chunk algo changes, regardless of page changes --- .../src/contentStore/EmbeddedContent.ts | 1 - .../MongoDbEmbeddedContentStore.ts | 4 - .../updateEmbeddedContent.test.ts | 151 +++++++++++++++++- .../src/contentStore/updateEmbeddedContent.ts | 1 - 4 files changed, 150 insertions(+), 7 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts index 1c5c572eb..4f41c440a 100644 --- a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts @@ -89,7 +89,6 @@ export type EmbeddedContentStore = VectorStore & { */ getPagesFromEmbeddedContent(args: { dataSources?: string[]; - updated: Date; chunkAlgoHash: string; inverseChunkAlgoHash?: boolean; }): Promise; diff --git a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts index fa6c7c028..10e4e1c3c 100644 --- a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts +++ b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts @@ -97,7 +97,6 @@ export function makeMongoDbEmbeddedContentStore({ async getPagesFromEmbeddedContent({ dataSources, - updated, chunkAlgoHash, inverseChunkAlgoHash = false, }): Promise { @@ -105,9 +104,6 @@ export function makeMongoDbEmbeddedContentStore({ { $match: { ...(dataSources ? { sourceName: { $in: dataSources } } : undefined), - updated: { - $gte: updated, - }, chunkAlgoHash: inverseChunkAlgoHash ? { $ne: chunkAlgoHash } : chunkAlgoHash, diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index c75233147..3514bb742 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -2,12 +2,16 @@ import { updateEmbeddedContent, updateEmbeddedContentForPage, } from "./updateEmbeddedContent"; -import { persistPages } from "."; +import { makeMongoDbEmbeddedContentStore, makeMongoDbPageStore, MongoDbEmbeddedContentStore, MongoDbPageStore, PageStore, persistPages, updatePages } from "."; import { makeMockPageStore } from "../test/MockPageStore"; import * as chunkPageModule from "../chunk/chunkPage"; import { EmbeddedContentStore, EmbeddedContent } from "./EmbeddedContent"; import { Embedder } from "../embed"; import { Page, PersistedPage } from "."; +import { strict as assert } from "assert"; +import { MongoMemoryReplSet } from "mongodb-memory-server"; +import { DataSource } from "../dataSources"; +import { MongoClient } from "mongodb"; export const makeMockEmbeddedContentStore = (): EmbeddedContentStore => { const content: Map = new Map(); @@ -279,3 +283,148 @@ describe("updateEmbeddedContent", () => { }); }); }); + + +describe("updateEmbeddedContent", () => { + let mongod: MongoMemoryReplSet | undefined; + let pageStore: MongoDbPageStore; + let embedStore: MongoDbEmbeddedContentStore; + let uri: string; + let databaseName: string; + let mongoClient: MongoClient; + let page1Embedding: EmbeddedContent[], page2Embedding: EmbeddedContent[] + let pages: PersistedPage[] = []; + + const embedder = { + async embed() { + return { embedding: [1, 2, 3] }; + }, + }; + const mockDataSources: DataSource[] = [ + { + name: "source1", + fetchPages: async () => [ + { + url: "test1.com", + format: "html", + sourceName: "source1", + body: "hello source 1", + }, + ], + }, + { + name: "source2", + fetchPages: async () => [ + { + url: "test2.com", + format: "html", + sourceName: "source2", + body: "hello source 1", + }, + ], + }, + ]; + const mockDataSourceNames = mockDataSources.map( + (dataSource) => dataSource.name + ); + + beforeEach(async () => { + databaseName = "test-all-command"; + mongod = await MongoMemoryReplSet.create(); + uri = mongod.getUri(); + mongoClient = await MongoClient.connect(mongod.getUri(), {}); + embedStore = makeMongoDbEmbeddedContentStore({ + connectionUri: uri, + databaseName, + searchIndex: { embeddingName: "test-embedding" }, + }); + pageStore = makeMongoDbPageStore({ + connectionUri: uri, + databaseName, + }); + // create pages and verify that they have been created + await updatePages({ sources: mockDataSources, pageStore }); + pages = await pageStore.loadPages(); + assert(pages.length == 2); + // create embeddings for the pages and verify that they have been created + await updateEmbeddedContent({ + since: new Date(0), + embeddedContentStore: embedStore, + pageStore, + sourceNames: mockDataSourceNames, + embedder, + }); + page1Embedding = await embedStore.loadEmbeddedContent({ + page: pages[0], + }); + page2Embedding = await embedStore.loadEmbeddedContent({ + page: pages[1], + }); + assert(page1Embedding.length); + assert(page2Embedding.length); + }); + + it("updates embedded content for pages that have been updated after the 'since' date provided", async () => { + // Modify dates of pages and embedded content for testing + const sinceDate = new Date("2024-01-01") + const beforeSinceDate = new Date("2023-01-01") + const afterSinceDate = new Date("2025-01-01") + // set pages[0] to be last updated before sinceDate (should not be modified) + await mongoClient.db(databaseName).collection('pages').updateOne({...pages[0]}, { $set: { updated: beforeSinceDate } }); + await mongoClient.db(databaseName).collection('embedded_content').updateOne({sourceName: mockDataSourceNames[0]}, { $set: { updated: beforeSinceDate } }); + // set pages[1] to be last updated after sinceDate (should be re-chunked) + await mongoClient.db(databaseName).collection('pages').updateOne({...pages[1]}, { $set: { updated: afterSinceDate } }); + await mongoClient.db(databaseName).collection('embedded_content').updateOne({sourceName: mockDataSourceNames[1]}, { $set: { updated: afterSinceDate } }); + const originalPage1Embedding = await embedStore.loadEmbeddedContent({ + page: pages[0], + }); + const originalPage2Embedding = await embedStore.loadEmbeddedContent({ + page: pages[1], + }); + await updateEmbeddedContent({ + since: sinceDate, + embeddedContentStore: embedStore, + pageStore, + sourceNames: mockDataSourceNames, + embedder, + }); + const updatedPage1Embedding = await embedStore.loadEmbeddedContent({ + page: pages[0], + }); + const updatedPage2Embedding = await embedStore.loadEmbeddedContent({ + page: pages[1], + }); + assert(updatedPage1Embedding.length); + assert(updatedPage2Embedding.length); + expect(updatedPage1Embedding[0].updated.getTime()).toBe(originalPage1Embedding[0].updated.getTime()); + expect(updatedPage2Embedding[0].updated.getTime()).not.toBe(originalPage2Embedding[0].updated.getTime()); + }); + it("updates embedded content when page has not changed, but chunk algo has, ignoring since date", async () => { + // change the chunking algo for the second page, but not the first + await updateEmbeddedContent({ + since: new Date(), + embeddedContentStore: embedStore, + pageStore, + sourceNames: [mockDataSourceNames[0]], + embedder, + }); + await updateEmbeddedContent({ + since: new Date(), + embeddedContentStore: embedStore, + pageStore, + sourceNames: [mockDataSourceNames[1]], + embedder, + chunkOptions: { chunkOverlap: 2 }, + }); + const updatedPage1Embedding = await embedStore.loadEmbeddedContent({ + page: pages[0], + }); + const updatedPage2Embedding = await embedStore.loadEmbeddedContent({ + page: pages[1], + }); + assert(updatedPage1Embedding.length); + assert(updatedPage2Embedding.length); + expect(updatedPage1Embedding[0].chunkAlgoHash).toBe(page1Embedding[0].chunkAlgoHash); + expect(updatedPage2Embedding[0].chunkAlgoHash).not.toBe(page2Embedding[0].chunkAlgoHash); + }); +}); \ No newline at end of file diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index cb300004f..f41dbab03 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -69,7 +69,6 @@ export const updateEmbeddedContent = async ({ const pagesWithChangedChunking = await embeddedContentStore.getPagesFromEmbeddedContent({ dataSources: sourceNames, - updated: since, chunkAlgoHash, inverseChunkAlgoHash: true, }); From 54259295a9bacedd88c2896b35a3c69681db7db9 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Fri, 20 Dec 2024 15:50:18 -0500 Subject: [PATCH 05/19] update mockEmbeddedContentStore to have newly added method --- .../src/findContent/BoostOnAtlasSearchFilter.test.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/mongodb-rag-core/src/findContent/BoostOnAtlasSearchFilter.test.ts b/packages/mongodb-rag-core/src/findContent/BoostOnAtlasSearchFilter.test.ts index 96e5f520f..42a4354d0 100644 --- a/packages/mongodb-rag-core/src/findContent/BoostOnAtlasSearchFilter.test.ts +++ b/packages/mongodb-rag-core/src/findContent/BoostOnAtlasSearchFilter.test.ts @@ -69,6 +69,9 @@ describe("makeBoostOnAtlasSearchFilter()", () => { async findNearestNeighbors() { return mockBoostedResults; }, + async getPagesFromEmbeddedContent(args) { + return []; + }, metadata: { embeddingName, }, From e358ca09d14b0057d0ad6629fb66b888689564c4 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Fri, 20 Dec 2024 16:02:25 -0500 Subject: [PATCH 06/19] teardown in afterEach --- .../src/contentStore/updateEmbeddedContent.test.ts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index 3514bb742..514551782 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -364,6 +364,15 @@ describe("updateEmbeddedContent", () => { assert(page2Embedding.length); }); + afterEach(async () => { + await pageStore.drop(); + await pageStore.close(); + await embedStore.drop(); + await embedStore.close(); + await mongoClient.close(); + await mongod?.stop(); + }); + it("updates embedded content for pages that have been updated after the 'since' date provided", async () => { // Modify dates of pages and embedded content for testing const sinceDate = new Date("2024-01-01") From 57091b2951ca157966ffb3dc06527f7ae55fe361 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Fri, 20 Dec 2024 16:18:41 -0500 Subject: [PATCH 07/19] chunkAlgoHash creation moved up in function chain, value passed down --- .../src/contentStore/updateEmbeddedContent.test.ts | 1 + .../src/contentStore/updateEmbeddedContent.ts | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index 514551782..c66fd0d39 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -214,6 +214,7 @@ describe("updateEmbeddedContent", () => { store: embeddedContentStore, page, concurrencyOptions: { createChunks: 2 }, + chunkAlgoHash: "testchunkalgohash", }); const embeddedContent = await embeddedContentStore.loadEmbeddedContent({ diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index f41dbab03..186f90b8a 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -103,6 +103,7 @@ export const updateEmbeddedContent = async ({ chunkOptions, embedder, concurrencyOptions, + chunkAlgoHash, }); } }); @@ -114,12 +115,14 @@ export const updateEmbeddedContentForPage = async ({ embedder, chunkOptions, concurrencyOptions, + chunkAlgoHash, }: { page: PersistedPage; store: EmbeddedContentStore; embedder: Embedder; chunkOptions?: Partial; concurrencyOptions?: EmbedConcurrencyOptions; + chunkAlgoHash: string; }): Promise => { const contentChunks = await chunkPage(page, chunkOptions); @@ -142,7 +145,6 @@ export const updateEmbeddedContentForPage = async ({ const existingContent = await store.loadEmbeddedContent({ page, }); - const chunkAlgoHash = getHashForFunc(chunkPage, chunkOptions); if ( existingContent.length && existingContent[0].updated > page.updated && From 42ab655b9ee250e21cb519174f08d81a777a6e82 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Fri, 20 Dec 2024 16:32:47 -0500 Subject: [PATCH 08/19] move chunkAlgoHashes out of global scope --- .../src/contentStore/updateEmbeddedContent.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index 186f90b8a..ec13d1385 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -18,9 +18,8 @@ export interface EmbedConcurrencyOptions { createChunks?: number; } -const chunkAlgoHashes = new Map(); -const getHashForFunc = (f: ChunkFunc, o?: Partial): string => { +const getHashForFunc = (chunkAlgoHashes: Map, f: ChunkFunc, o?: Partial): string => { const data = JSON.stringify(o ?? {}) + f.toString(); const existingHash = chunkAlgoHashes.get(data); if (existingHash) { @@ -65,7 +64,8 @@ export const updateEmbeddedContent = async ({ }` ); // find all pages with embeddings created using chunkingHashes different from the current chunkingHash - const chunkAlgoHash = getHashForFunc(chunkPage, chunkOptions); + const chunkAlgoHashes = new Map(); + const chunkAlgoHash = getHashForFunc(chunkAlgoHashes, chunkPage, chunkOptions); const pagesWithChangedChunking = await embeddedContentStore.getPagesFromEmbeddedContent({ dataSources: sourceNames, From 5872f56dc4629a6df73e4060799a972705002606 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Wed, 15 Jan 2025 17:22:59 -0500 Subject: [PATCH 09/19] change implementation - query for data sources that use an old chunk algo hash --- .../src/contentStore/EmbeddedContent.ts | 14 ++-- .../MongoDbEmbeddedContentStore.ts | 77 ++++--------------- .../updateEmbeddedContent.test.ts | 39 +++++++++- .../src/contentStore/updateEmbeddedContent.ts | 20 +++-- .../BoostOnAtlasSearchFilter.test.ts | 2 +- 5 files changed, 69 insertions(+), 83 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts index 4f41c440a..dee172ad9 100644 --- a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts @@ -84,15 +84,6 @@ export type EmbeddedContentStore = VectorStore & { */ loadEmbeddedContent(args: { page: Page }): Promise; - /** - Get the Pages of the embedded content that meets the filter requirements. - */ - getPagesFromEmbeddedContent(args: { - dataSources?: string[]; - chunkAlgoHash: string; - inverseChunkAlgoHash?: boolean; - }): Promise; - /** Delete all embedded content for the given page and/or data sources. */ @@ -123,4 +114,9 @@ export type EmbeddedContentStore = VectorStore & { Initialize the store. */ init?: () => Promise; + + /** + Get the data sources that match the given query. + */ + getDataSources(matchQuery: any): Promise }; diff --git a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts index 10e4e1c3c..ade71e191 100644 --- a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts +++ b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts @@ -95,68 +95,6 @@ export function makeMongoDbEmbeddedContentStore({ return await embeddedContentCollection.find(pageIdentity(page)).toArray(); }, - async getPagesFromEmbeddedContent({ - dataSources, - chunkAlgoHash, - inverseChunkAlgoHash = false, - }): Promise { - const pipeline = [ - { - $match: { - ...(dataSources ? { sourceName: { $in: dataSources } } : undefined), - chunkAlgoHash: inverseChunkAlgoHash - ? { $ne: chunkAlgoHash } - : chunkAlgoHash, - }, - }, - { - $lookup: { - from: "pages", - let: { - url: "$url", - sourceName: "$sourceName", - }, - pipeline: [ - { - $match: { - $expr: { - $and: [ - { - $eq: ["$url", "$$url"], - }, - { - $eq: ["$sourceName", "$$sourceName"], - }, - ], - }, - }, - }, - ], - as: "pages", - }, - }, - { - $project: { - _id: 0, - pages: 1, - }, - }, - { - $unwind: { - path: "$pages", - }, - }, - { - $replaceRoot: { - newRoot: "$pages", - }, - }, - ]; - return await embeddedContentCollection - .aggregate(pipeline) - .toArray(); - }, - async deleteEmbeddedContent({ page, dataSources, @@ -294,5 +232,20 @@ export function makeMongoDbEmbeddedContentStore({ } } }, + + async getDataSources(matchQuery: any): Promise { + const result = await embeddedContentCollection.aggregate([ + { $match: matchQuery }, + { + $group: { + _id: null, + uniqueSources: { $addToSet: "$sourceName" } + } + }, + { $project: { _id: 0, uniqueSources: 1 } } + ]).toArray(); + const uniqueSources = result.length > 0 ? result[0].uniqueSources : []; + return uniqueSources; + } }; } diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index c66fd0d39..44274d6ec 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -30,12 +30,12 @@ export const makeMockEmbeddedContentStore = (): EmbeddedContentStore => { async updateEmbeddedContent({ embeddedContent, page }) { content.set(page.url, [...embeddedContent]); }, - async getPagesFromEmbeddedContent(args) { - return []; - }, metadata: { embeddingName: "test", }, + async getDataSources(matchQuery: any): Promise { + return []; + }, }; }; @@ -285,7 +285,7 @@ describe("updateEmbeddedContent", () => { }); }); - +// These tests use "mongodb-memory-server", not mockEmbeddedContentStore describe("updateEmbeddedContent", () => { let mongod: MongoMemoryReplSet | undefined; let pageStore: MongoDbPageStore; @@ -330,6 +330,7 @@ describe("updateEmbeddedContent", () => { ); beforeEach(async () => { + // setup mongo client, page store, and embedded content store databaseName = "test-all-command"; mongod = await MongoMemoryReplSet.create(); uri = mongod.getUri(); @@ -437,4 +438,34 @@ describe("updateEmbeddedContent", () => { expect(updatedPage1Embedding[0].chunkAlgoHash).toBe(page1Embedding[0].chunkAlgoHash); expect(updatedPage2Embedding[0].chunkAlgoHash).not.toBe(page2Embedding[0].chunkAlgoHash); }); + it("use a new chunking algo on data sources, some of which have pages that have been updated", async () => { + // SETUP: Modify dates of pages and embedded content for this test case + const sinceDate = new Date("2024-01-01") + const afterSinceDate = new Date("2025-01-01") + await mongoClient.db(databaseName).collection('pages').updateOne({...pages[0]}, { $set: { updated: afterSinceDate }}); + await mongoClient.db(databaseName).collection('embedded_content').updateOne({sourceName: mockDataSourceNames[0]}, { $set: { updated: afterSinceDate } }); + const originalPage1Embedding = await embedStore.loadEmbeddedContent({ + page: pages[0], + }); + // END SETUP + await updateEmbeddedContent({ + since: sinceDate, + embeddedContentStore: embedStore, + pageStore, + sourceNames: mockDataSourceNames, + embedder, + chunkOptions: { chunkOverlap: 2 }, + }); + const updatedPage1Embedding = await embedStore.loadEmbeddedContent({ + page: pages[0], + }); + const updatedPage2Embedding = await embedStore.loadEmbeddedContent({ + page: pages[1], + }); + assert(updatedPage1Embedding.length); + assert(updatedPage2Embedding.length); + // both pages should be updated + expect(updatedPage1Embedding[0].chunkAlgoHash).not.toBe(originalPage1Embedding[0].chunkAlgoHash); + expect(updatedPage2Embedding[0].chunkAlgoHash).not.toBe(page2Embedding[0].chunkAlgoHash); + }); }); \ No newline at end of file diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index ec13d1385..311b0884c 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -66,16 +66,22 @@ export const updateEmbeddedContent = async ({ // find all pages with embeddings created using chunkingHashes different from the current chunkingHash const chunkAlgoHashes = new Map(); const chunkAlgoHash = getHashForFunc(chunkAlgoHashes, chunkPage, chunkOptions); - const pagesWithChangedChunking = - await embeddedContentStore.getPagesFromEmbeddedContent({ - dataSources: sourceNames, - chunkAlgoHash, - inverseChunkAlgoHash: true, - }); + const dataSourcesWithChangedChunking = await embeddedContentStore.getDataSources({ + chunkAlgoHash: {$ne: chunkAlgoHash}, + // run on specific source names if specified, run on all if not + ...(sourceNames ? {"sourceName": { + "$in": sourceNames + }}: undefined) + }) + // find all pages with changed chunking, ignoring since date because + // we want to re-chunk all pages with the new chunkAlgoHash, even if there were no other changes to the page + const pagesWithChangedChunking = await pageStore.loadPages({ + sources: dataSourcesWithChangedChunking + }) logger.info( `Found ${ pagesWithChangedChunking.length - } pages with changed chunkingHashes since ${since}${ + } pages with changed chunkingHashes ${ sourceNames ? ` in sources: ${sourceNames.join(", ")}` : "" }` ); diff --git a/packages/mongodb-rag-core/src/findContent/BoostOnAtlasSearchFilter.test.ts b/packages/mongodb-rag-core/src/findContent/BoostOnAtlasSearchFilter.test.ts index 42a4354d0..24454bc6a 100644 --- a/packages/mongodb-rag-core/src/findContent/BoostOnAtlasSearchFilter.test.ts +++ b/packages/mongodb-rag-core/src/findContent/BoostOnAtlasSearchFilter.test.ts @@ -69,7 +69,7 @@ describe("makeBoostOnAtlasSearchFilter()", () => { async findNearestNeighbors() { return mockBoostedResults; }, - async getPagesFromEmbeddedContent(args) { + async getDataSources(matchQuery) { return []; }, metadata: { From 963870a4da85cf2cfb8a22dd3e4b30f13be523d2 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Wed, 15 Jan 2025 17:25:23 -0500 Subject: [PATCH 10/19] cleanup and lint --- .../src/contentStore/EmbeddedContent.ts | 2 +- .../MongoDbEmbeddedContentStore.ts | 24 ++--- .../updateEmbeddedContent.test.ts | 91 ++++++++++++++----- .../src/contentStore/updateEmbeddedContent.ts | 36 +++++--- 4 files changed, 107 insertions(+), 46 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts index dee172ad9..71b4b733c 100644 --- a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts @@ -118,5 +118,5 @@ export type EmbeddedContentStore = VectorStore & { /** Get the data sources that match the given query. */ - getDataSources(matchQuery: any): Promise + getDataSources(matchQuery: any): Promise; }; diff --git a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts index ade71e191..8670c1fd3 100644 --- a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts +++ b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts @@ -234,18 +234,20 @@ export function makeMongoDbEmbeddedContentStore({ }, async getDataSources(matchQuery: any): Promise { - const result = await embeddedContentCollection.aggregate([ - { $match: matchQuery }, - { - $group: { - _id: null, - uniqueSources: { $addToSet: "$sourceName" } - } - }, - { $project: { _id: 0, uniqueSources: 1 } } - ]).toArray(); + const result = await embeddedContentCollection + .aggregate([ + { $match: matchQuery }, + { + $group: { + _id: null, + uniqueSources: { $addToSet: "$sourceName" }, + }, + }, + { $project: { _id: 0, uniqueSources: 1 } }, + ]) + .toArray(); const uniqueSources = result.length > 0 ? result[0].uniqueSources : []; return uniqueSources; - } + }, }; } diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index 44274d6ec..fc0ff4224 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -2,7 +2,15 @@ import { updateEmbeddedContent, updateEmbeddedContentForPage, } from "./updateEmbeddedContent"; -import { makeMongoDbEmbeddedContentStore, makeMongoDbPageStore, MongoDbEmbeddedContentStore, MongoDbPageStore, PageStore, persistPages, updatePages } from "."; +import { + makeMongoDbEmbeddedContentStore, + makeMongoDbPageStore, + MongoDbEmbeddedContentStore, + MongoDbPageStore, + PageStore, + persistPages, + updatePages, +} from "."; import { makeMockPageStore } from "../test/MockPageStore"; import * as chunkPageModule from "../chunk/chunkPage"; import { EmbeddedContentStore, EmbeddedContent } from "./EmbeddedContent"; @@ -285,7 +293,7 @@ describe("updateEmbeddedContent", () => { }); }); -// These tests use "mongodb-memory-server", not mockEmbeddedContentStore +// These tests use "mongodb-memory-server", not mockEmbeddedContentStore describe("updateEmbeddedContent", () => { let mongod: MongoMemoryReplSet | undefined; let pageStore: MongoDbPageStore; @@ -293,9 +301,9 @@ describe("updateEmbeddedContent", () => { let uri: string; let databaseName: string; let mongoClient: MongoClient; - let page1Embedding: EmbeddedContent[], page2Embedding: EmbeddedContent[] + let page1Embedding: EmbeddedContent[], page2Embedding: EmbeddedContent[]; let pages: PersistedPage[] = []; - + const embedder = { async embed() { return { embedding: [1, 2, 3] }; @@ -377,15 +385,33 @@ describe("updateEmbeddedContent", () => { it("updates embedded content for pages that have been updated after the 'since' date provided", async () => { // Modify dates of pages and embedded content for testing - const sinceDate = new Date("2024-01-01") - const beforeSinceDate = new Date("2023-01-01") - const afterSinceDate = new Date("2025-01-01") + const sinceDate = new Date("2024-01-01"); + const beforeSinceDate = new Date("2023-01-01"); + const afterSinceDate = new Date("2025-01-01"); // set pages[0] to be last updated before sinceDate (should not be modified) - await mongoClient.db(databaseName).collection('pages').updateOne({...pages[0]}, { $set: { updated: beforeSinceDate } }); - await mongoClient.db(databaseName).collection('embedded_content').updateOne({sourceName: mockDataSourceNames[0]}, { $set: { updated: beforeSinceDate } }); + await mongoClient + .db(databaseName) + .collection("pages") + .updateOne({ ...pages[0] }, { $set: { updated: beforeSinceDate } }); + await mongoClient + .db(databaseName) + .collection("embedded_content") + .updateOne( + { sourceName: mockDataSourceNames[0] }, + { $set: { updated: beforeSinceDate } } + ); // set pages[1] to be last updated after sinceDate (should be re-chunked) - await mongoClient.db(databaseName).collection('pages').updateOne({...pages[1]}, { $set: { updated: afterSinceDate } }); - await mongoClient.db(databaseName).collection('embedded_content').updateOne({sourceName: mockDataSourceNames[1]}, { $set: { updated: afterSinceDate } }); + await mongoClient + .db(databaseName) + .collection("pages") + .updateOne({ ...pages[1] }, { $set: { updated: afterSinceDate } }); + await mongoClient + .db(databaseName) + .collection("embedded_content") + .updateOne( + { sourceName: mockDataSourceNames[1] }, + { $set: { updated: afterSinceDate } } + ); const originalPage1Embedding = await embedStore.loadEmbeddedContent({ page: pages[0], }); @@ -407,8 +433,12 @@ describe("updateEmbeddedContent", () => { }); assert(updatedPage1Embedding.length); assert(updatedPage2Embedding.length); - expect(updatedPage1Embedding[0].updated.getTime()).toBe(originalPage1Embedding[0].updated.getTime()); - expect(updatedPage2Embedding[0].updated.getTime()).not.toBe(originalPage2Embedding[0].updated.getTime()); + expect(updatedPage1Embedding[0].updated.getTime()).toBe( + originalPage1Embedding[0].updated.getTime() + ); + expect(updatedPage2Embedding[0].updated.getTime()).not.toBe( + originalPage2Embedding[0].updated.getTime() + ); }); it("updates embedded content when page has not changed, but chunk algo has, ignoring since date", async () => { // change the chunking algo for the second page, but not the first @@ -435,15 +465,28 @@ describe("updateEmbeddedContent", () => { }); assert(updatedPage1Embedding.length); assert(updatedPage2Embedding.length); - expect(updatedPage1Embedding[0].chunkAlgoHash).toBe(page1Embedding[0].chunkAlgoHash); - expect(updatedPage2Embedding[0].chunkAlgoHash).not.toBe(page2Embedding[0].chunkAlgoHash); + expect(updatedPage1Embedding[0].chunkAlgoHash).toBe( + page1Embedding[0].chunkAlgoHash + ); + expect(updatedPage2Embedding[0].chunkAlgoHash).not.toBe( + page2Embedding[0].chunkAlgoHash + ); }); it("use a new chunking algo on data sources, some of which have pages that have been updated", async () => { // SETUP: Modify dates of pages and embedded content for this test case - const sinceDate = new Date("2024-01-01") - const afterSinceDate = new Date("2025-01-01") - await mongoClient.db(databaseName).collection('pages').updateOne({...pages[0]}, { $set: { updated: afterSinceDate }}); - await mongoClient.db(databaseName).collection('embedded_content').updateOne({sourceName: mockDataSourceNames[0]}, { $set: { updated: afterSinceDate } }); + const sinceDate = new Date("2024-01-01"); + const afterSinceDate = new Date("2025-01-01"); + await mongoClient + .db(databaseName) + .collection("pages") + .updateOne({ ...pages[0] }, { $set: { updated: afterSinceDate } }); + await mongoClient + .db(databaseName) + .collection("embedded_content") + .updateOne( + { sourceName: mockDataSourceNames[0] }, + { $set: { updated: afterSinceDate } } + ); const originalPage1Embedding = await embedStore.loadEmbeddedContent({ page: pages[0], }); @@ -465,7 +508,11 @@ describe("updateEmbeddedContent", () => { assert(updatedPage1Embedding.length); assert(updatedPage2Embedding.length); // both pages should be updated - expect(updatedPage1Embedding[0].chunkAlgoHash).not.toBe(originalPage1Embedding[0].chunkAlgoHash); - expect(updatedPage2Embedding[0].chunkAlgoHash).not.toBe(page2Embedding[0].chunkAlgoHash); + expect(updatedPage1Embedding[0].chunkAlgoHash).not.toBe( + originalPage1Embedding[0].chunkAlgoHash + ); + expect(updatedPage2Embedding[0].chunkAlgoHash).not.toBe( + page2Embedding[0].chunkAlgoHash + ); }); -}); \ No newline at end of file +}); diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index 311b0884c..be1b2bfa5 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -18,8 +18,11 @@ export interface EmbedConcurrencyOptions { createChunks?: number; } - -const getHashForFunc = (chunkAlgoHashes: Map, f: ChunkFunc, o?: Partial): string => { +const getHashForFunc = ( + chunkAlgoHashes: Map, + f: ChunkFunc, + o?: Partial +): string => { const data = JSON.stringify(o ?? {}) + f.toString(); const existingHash = chunkAlgoHashes.get(data); if (existingHash) { @@ -65,19 +68,28 @@ export const updateEmbeddedContent = async ({ ); // find all pages with embeddings created using chunkingHashes different from the current chunkingHash const chunkAlgoHashes = new Map(); - const chunkAlgoHash = getHashForFunc(chunkAlgoHashes, chunkPage, chunkOptions); - const dataSourcesWithChangedChunking = await embeddedContentStore.getDataSources({ - chunkAlgoHash: {$ne: chunkAlgoHash}, - // run on specific source names if specified, run on all if not - ...(sourceNames ? {"sourceName": { - "$in": sourceNames - }}: undefined) - }) + const chunkAlgoHash = getHashForFunc( + chunkAlgoHashes, + chunkPage, + chunkOptions + ); + const dataSourcesWithChangedChunking = + await embeddedContentStore.getDataSources({ + chunkAlgoHash: { $ne: chunkAlgoHash }, + // run on specific source names if specified, run on all if not + ...(sourceNames + ? { + sourceName: { + $in: sourceNames, + }, + } + : undefined), + }); // find all pages with changed chunking, ignoring since date because // we want to re-chunk all pages with the new chunkAlgoHash, even if there were no other changes to the page const pagesWithChangedChunking = await pageStore.loadPages({ - sources: dataSourcesWithChangedChunking - }) + sources: dataSourcesWithChangedChunking, + }); logger.info( `Found ${ pagesWithChangedChunking.length From 81bee15b401e11f84645fe632998f8cdad0b4eef Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Tue, 21 Jan 2025 11:57:20 -0500 Subject: [PATCH 11/19] fix tests --- .../src/contentStore/updateEmbeddedContent.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index fc0ff4224..34a9654e2 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -375,8 +375,8 @@ describe("updateEmbeddedContent", () => { }); afterEach(async () => { - await pageStore.drop(); - await pageStore.close(); + await pageStore?.drop(); + await pageStore?.close(); await embedStore.drop(); await embedStore.close(); await mongoClient.close(); From 4fdf716d3956ddaae7fce6a4194ac051be81f535 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Tue, 21 Jan 2025 17:00:15 -0500 Subject: [PATCH 12/19] move mongo memory server creation from beforeEach to beforeAll --- .../src/contentStore/updateEmbeddedContent.test.ts | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index 34a9654e2..f35535353 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -336,13 +336,14 @@ describe("updateEmbeddedContent", () => { const mockDataSourceNames = mockDataSources.map( (dataSource) => dataSource.name ); - - beforeEach(async () => { - // setup mongo client, page store, and embedded content store - databaseName = "test-all-command"; + beforeAll(async () => { mongod = await MongoMemoryReplSet.create(); uri = mongod.getUri(); mongoClient = await MongoClient.connect(mongod.getUri(), {}); + }); + beforeEach(async () => { + // setup mongo client, page store, and embedded content store + databaseName = "test-all-command"; embedStore = makeMongoDbEmbeddedContentStore({ connectionUri: uri, databaseName, @@ -376,8 +377,10 @@ describe("updateEmbeddedContent", () => { afterEach(async () => { await pageStore?.drop(); - await pageStore?.close(); await embedStore.drop(); + }); + afterAll(async () => { + await pageStore?.close(); await embedStore.close(); await mongoClient.close(); await mongod?.stop(); From 8ed50ec172d932b12665f7954715d48b1270b82d Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Wed, 22 Jan 2025 14:48:34 -0500 Subject: [PATCH 13/19] refactor --- .../src/contentStore/EmbeddedContent.ts | 10 ++++++- .../MongoDbEmbeddedContentStore.ts | 27 ++++++++++++++++--- .../updateEmbeddedContent.test.ts | 7 +++-- .../src/contentStore/updateEmbeddedContent.ts | 14 ++++------ 4 files changed, 41 insertions(+), 17 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts index 71b4b733c..fdece3c0f 100644 --- a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts @@ -75,6 +75,14 @@ export type DeleteEmbeddedContentArgs = { inverseDataSources?: boolean; }; +export interface GetSourcesMatchParams { + sourceNames?: string[]; + chunkAlgoHash: { + hashValue: string; + operation: "equals" | "notEquals" + } +} + /** Data store of the embedded content. */ @@ -118,5 +126,5 @@ export type EmbeddedContentStore = VectorStore & { /** Get the data sources that match the given query. */ - getDataSources(matchQuery: any): Promise; + getDataSources(matchQuery: GetSourcesMatchParams): Promise; }; diff --git a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts index 8670c1fd3..4e5d5eea2 100644 --- a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts +++ b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts @@ -1,6 +1,6 @@ import { pageIdentity, PersistedPage } from "."; import { DatabaseConnection } from "../DatabaseConnection"; -import { EmbeddedContent, EmbeddedContentStore } from "./EmbeddedContent"; +import { EmbeddedContent, EmbeddedContentStore, GetSourcesMatchParams } from "./EmbeddedContent"; import { FindNearestNeighborsOptions, WithScore } from "../VectorStore"; import { MakeMongoDbDatabaseConnectionParams, @@ -58,6 +58,27 @@ export type MongoDbEmbeddedContentStore = EmbeddedContentStore & init(): Promise; }; +function makeMatchQuery ({sourceNames, chunkAlgoHash}: GetSourcesMatchParams) { + let operator = ''; + if(chunkAlgoHash.operation === 'equals'){ + operator = '$eq' + } + if(chunkAlgoHash.operation === 'notEquals'){ + operator = '$ne' + } + return ({ + chunkAlgoHash: { [operator]: chunkAlgoHash.hashValue }, + // run on specific source names if specified, run on all if not + ...(sourceNames + ? { + sourceName: { + $in: sourceNames, + }, + } + : undefined), + }); +} + export function makeMongoDbEmbeddedContentStore({ connectionUri, databaseName, @@ -233,10 +254,10 @@ export function makeMongoDbEmbeddedContentStore({ } }, - async getDataSources(matchQuery: any): Promise { + async getDataSources(matchQuery: GetSourcesMatchParams): Promise { const result = await embeddedContentCollection .aggregate([ - { $match: matchQuery }, + { $match: makeMatchQuery(matchQuery) }, { $group: { _id: null, diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index f35535353..9f6dab231 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -7,13 +7,12 @@ import { makeMongoDbPageStore, MongoDbEmbeddedContentStore, MongoDbPageStore, - PageStore, persistPages, updatePages, } from "."; import { makeMockPageStore } from "../test/MockPageStore"; import * as chunkPageModule from "../chunk/chunkPage"; -import { EmbeddedContentStore, EmbeddedContent } from "./EmbeddedContent"; +import { EmbeddedContentStore, EmbeddedContent, GetSourcesMatchParams } from "./EmbeddedContent"; import { Embedder } from "../embed"; import { Page, PersistedPage } from "."; import { strict as assert } from "assert"; @@ -41,7 +40,7 @@ export const makeMockEmbeddedContentStore = (): EmbeddedContentStore => { metadata: { embeddingName: "test", }, - async getDataSources(matchQuery: any): Promise { + async getDataSources(matchQuery: GetSourcesMatchParams): Promise { return []; }, }; @@ -328,7 +327,7 @@ describe("updateEmbeddedContent", () => { url: "test2.com", format: "html", sourceName: "source2", - body: "hello source 1", + body: "hello source 2", }, ], }, diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index be1b2bfa5..35507dfdf 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -75,15 +75,11 @@ export const updateEmbeddedContent = async ({ ); const dataSourcesWithChangedChunking = await embeddedContentStore.getDataSources({ - chunkAlgoHash: { $ne: chunkAlgoHash }, - // run on specific source names if specified, run on all if not - ...(sourceNames - ? { - sourceName: { - $in: sourceNames, - }, - } - : undefined), + chunkAlgoHash: { + hashValue: chunkAlgoHash, + operation: "notEquals" + }, + sourceNames }); // find all pages with changed chunking, ignoring since date because // we want to re-chunk all pages with the new chunkAlgoHash, even if there were no other changes to the page From 743515f6a2dcd3af27913233e253eb182e821018 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Wed, 22 Jan 2025 15:32:53 -0500 Subject: [PATCH 14/19] lint --- .../src/contentStore/EmbeddedContent.ts | 4 +-- .../MongoDbEmbeddedContentStore.ts | 26 +++++++++++-------- .../updateEmbeddedContent.test.ts | 12 ++++++--- .../src/contentStore/updateEmbeddedContent.ts | 6 ++--- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts index fdece3c0f..9bd65fc21 100644 --- a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts @@ -79,8 +79,8 @@ export interface GetSourcesMatchParams { sourceNames?: string[]; chunkAlgoHash: { hashValue: string; - operation: "equals" | "notEquals" - } + operation: "equals" | "notEquals"; + }; } /** diff --git a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts index 4e5d5eea2..b983392e1 100644 --- a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts +++ b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts @@ -1,6 +1,10 @@ import { pageIdentity, PersistedPage } from "."; import { DatabaseConnection } from "../DatabaseConnection"; -import { EmbeddedContent, EmbeddedContentStore, GetSourcesMatchParams } from "./EmbeddedContent"; +import { + EmbeddedContent, + EmbeddedContentStore, + GetSourcesMatchParams, +} from "./EmbeddedContent"; import { FindNearestNeighborsOptions, WithScore } from "../VectorStore"; import { MakeMongoDbDatabaseConnectionParams, @@ -58,15 +62,15 @@ export type MongoDbEmbeddedContentStore = EmbeddedContentStore & init(): Promise; }; -function makeMatchQuery ({sourceNames, chunkAlgoHash}: GetSourcesMatchParams) { - let operator = ''; - if(chunkAlgoHash.operation === 'equals'){ - operator = '$eq' - } - if(chunkAlgoHash.operation === 'notEquals'){ - operator = '$ne' - } - return ({ +function makeMatchQuery({ sourceNames, chunkAlgoHash }: GetSourcesMatchParams) { + let operator = ""; + if (chunkAlgoHash.operation === "equals") { + operator = "$eq"; + } + if (chunkAlgoHash.operation === "notEquals") { + operator = "$ne"; + } + return { chunkAlgoHash: { [operator]: chunkAlgoHash.hashValue }, // run on specific source names if specified, run on all if not ...(sourceNames @@ -76,7 +80,7 @@ function makeMatchQuery ({sourceNames, chunkAlgoHash}: GetSourcesMatchParams) { }, } : undefined), - }); + }; } export function makeMongoDbEmbeddedContentStore({ diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index 9f6dab231..7e69b6997 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -12,7 +12,11 @@ import { } from "."; import { makeMockPageStore } from "../test/MockPageStore"; import * as chunkPageModule from "../chunk/chunkPage"; -import { EmbeddedContentStore, EmbeddedContent, GetSourcesMatchParams } from "./EmbeddedContent"; +import { + EmbeddedContentStore, + EmbeddedContent, + GetSourcesMatchParams, +} from "./EmbeddedContent"; import { Embedder } from "../embed"; import { Page, PersistedPage } from "."; import { strict as assert } from "assert"; @@ -376,12 +380,12 @@ describe("updateEmbeddedContent", () => { afterEach(async () => { await pageStore?.drop(); - await embedStore.drop(); + await embedStore?.drop(); }); afterAll(async () => { await pageStore?.close(); - await embedStore.close(); - await mongoClient.close(); + await embedStore?.close(); + await mongoClient?.close(); await mongod?.stop(); }); diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index 35507dfdf..f6c4a086a 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -75,11 +75,11 @@ export const updateEmbeddedContent = async ({ ); const dataSourcesWithChangedChunking = await embeddedContentStore.getDataSources({ - chunkAlgoHash: { + chunkAlgoHash: { hashValue: chunkAlgoHash, - operation: "notEquals" + operation: "notEquals", }, - sourceNames + sourceNames, }); // find all pages with changed chunking, ignoring since date because // we want to re-chunk all pages with the new chunkAlgoHash, even if there were no other changes to the page From dcfff5b015319b3cb4a04e21b6ec068334c38d7b Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Wed, 22 Jan 2025 23:47:57 -0500 Subject: [PATCH 15/19] fix beforeAll --- .../src/contentStore/updateEmbeddedContent.test.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index 7e69b6997..5e3e2d178 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -342,7 +342,8 @@ describe("updateEmbeddedContent", () => { beforeAll(async () => { mongod = await MongoMemoryReplSet.create(); uri = mongod.getUri(); - mongoClient = await MongoClient.connect(mongod.getUri(), {}); + mongoClient = new MongoClient(uri); + await mongoClient.connect(); }); beforeEach(async () => { // setup mongo client, page store, and embedded content store From 517ac50c8cd8281575d470c0e746143324540dd6 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Tue, 25 Mar 2025 12:38:40 -0400 Subject: [PATCH 16/19] Apply suggestions from code review: changes to comment copy Co-authored-by: Nick Larew --- packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts | 2 +- .../mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts index 9bd65fc21..dc08dad8b 100644 --- a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts @@ -124,7 +124,7 @@ export type EmbeddedContentStore = VectorStore & { init?: () => Promise; /** - Get the data sources that match the given query. + Get the names of ingested data sources that match the given query. */ getDataSources(matchQuery: GetSourcesMatchParams): Promise; }; diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index f6c4a086a..0b927e0ea 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -36,7 +36,7 @@ const getHashForFunc = ( }; /** - (Re-)embeddedContent the pages in the page store that have changed (copy change, or chunking algothrim change) + (Re-)embeddedContent the pages in the page store that have changed (content and/or chunking algorithm change) since the given date and stores the embeddedContent in the embeddedContent store. */ export const updateEmbeddedContent = async ({ From c9ba2fb4c4135d1eaef024fa3ee578b1f95aae10 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Tue, 25 Mar 2025 15:25:48 -0400 Subject: [PATCH 17/19] address feedback --- .../src/contentStore/EmbeddedContent.ts | 2 +- .../MongoDbEmbeddedContentStore.test.ts | 9 ++++--- .../MongoDbEmbeddedContentStore.ts | 8 +----- .../updateEmbeddedContent.test.ts | 3 ++- .../src/contentStore/updateEmbeddedContent.ts | 26 +++++++++++++++++-- 5 files changed, 34 insertions(+), 14 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts index dc08dad8b..265fb7621 100644 --- a/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/EmbeddedContent.ts @@ -1,4 +1,4 @@ -import { Page, PersistedPage } from "."; +import { Page } from "."; import { VectorStore } from "../VectorStore"; /** diff --git a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.test.ts b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.test.ts index 120e6d14a..9d21d2e39 100644 --- a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.test.ts @@ -144,10 +144,13 @@ describe("MongoDbEmbeddedContentStore", () => { assert(store); const originalPageEmbedding = await store.loadEmbeddedContent({ page }); assert(originalPageEmbedding.length === 1); - + const newEmbeddings = [{ ...originalPageEmbedding[0], text: "new text" }]; - await store.updateEmbeddedContent({ page, embeddedContent: newEmbeddings }); - + await store.updateEmbeddedContent({ + page, + embeddedContent: newEmbeddings, + }); + const pageEmbeddings = await store.loadEmbeddedContent({ page }); expect(pageEmbeddings.length).toBe(1); expect(pageEmbeddings[0].text).toBe("new text"); diff --git a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts index b983392e1..198c1ae5f 100644 --- a/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts +++ b/packages/mongodb-rag-core/src/contentStore/MongoDbEmbeddedContentStore.ts @@ -63,13 +63,7 @@ export type MongoDbEmbeddedContentStore = EmbeddedContentStore & }; function makeMatchQuery({ sourceNames, chunkAlgoHash }: GetSourcesMatchParams) { - let operator = ""; - if (chunkAlgoHash.operation === "equals") { - operator = "$eq"; - } - if (chunkAlgoHash.operation === "notEquals") { - operator = "$ne"; - } + const operator = chunkAlgoHash.operation === "equals" ? "$eq" : "$ne"; return { chunkAlgoHash: { [operator]: chunkAlgoHash.hashValue }, // run on specific source names if specified, run on all if not diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index 5e3e2d178..58b7e9bac 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -67,7 +67,8 @@ const embedder = { }, }; -describe("updateEmbeddedContent", () => { +// TODO: deprecate mock store and use mongodb-memory-server instead. Move these test cases into the describe block below which uses mongodb-memory-server. +describe("updateEmbeddedContent (test cases using mock store)", () => { it("deletes embedded content for deleted page", async () => { const pageStore = makeMockPageStore(); diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index 0b927e0ea..9518817ec 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -36,8 +36,30 @@ const getHashForFunc = ( }; /** - (Re-)embeddedContent the pages in the page store that have changed (content and/or chunking algorithm change) - since the given date and stores the embeddedContent in the embeddedContent store. + Updates embedded content for pages that have changed since a given date or have different chunking hashes. + + @param options - The configuration options for updating embedded content + @param options.since - The date from which to check for content changes + @param options.embeddedContentStore - The store containing embedded content + @param options.pageStore - The store containing pages + @param options.sourceNames - Optional array of source names to filter pages by + @param options.embedder - The embedder instance used to generate embeddings + @param options.chunkOptions - Optional configuration for chunking algorithm + @param options.concurrencyOptions - Optional configuration for controlling concurrency during embedding + + @remarks + The function performs the following steps: + 1. Finds pages with content changes since the specified date + 2. Identifies pages with different chunking hashes from the current algorithm + 3. Processes both sets of pages by either: + - Deleting embedded content for deleted pages + - Updating embedded content for created/updated pages + + Processing is done with configurable concurrency to manage system resources. + + @throws Will throw an error if there are issues accessing stores or processing pages + + @returns Promise that resolves when all updates are complete */ export const updateEmbeddedContent = async ({ since, From 0213ad3c66cfafdb14703914b65b9ce712a3b121 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Tue, 25 Mar 2025 16:05:59 -0400 Subject: [PATCH 18/19] test case descriptions --- .../src/contentStore/updateEmbeddedContent.test.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts index 58b7e9bac..fcdd0d228 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.test.ts @@ -67,8 +67,8 @@ const embedder = { }, }; -// TODO: deprecate mock store and use mongodb-memory-server instead. Move these test cases into the describe block below which uses mongodb-memory-server. -describe("updateEmbeddedContent (test cases using mock store)", () => { +// TODO: deprecate mock store and use mongodb-memory-server instead. +describe("updateEmbeddedContent", () => { it("deletes embedded content for deleted page", async () => { const pageStore = makeMockPageStore(); @@ -298,7 +298,7 @@ describe("updateEmbeddedContent (test cases using mock store)", () => { }); // These tests use "mongodb-memory-server", not mockEmbeddedContentStore -describe("updateEmbeddedContent", () => { +describe("updateEmbeddedContent updates chunks based on changes to copy or changes to the chunk algo", () => { let mongod: MongoMemoryReplSet | undefined; let pageStore: MongoDbPageStore; let embedStore: MongoDbEmbeddedContentStore; @@ -391,7 +391,7 @@ describe("updateEmbeddedContent", () => { await mongod?.stop(); }); - it("updates embedded content for pages that have been updated after the 'since' date provided", async () => { + it("should update embedded content only for pages that have been updated (copy change) after the 'since' date provided", async () => { // Modify dates of pages and embedded content for testing const sinceDate = new Date("2024-01-01"); const beforeSinceDate = new Date("2023-01-01"); @@ -448,7 +448,7 @@ describe("updateEmbeddedContent", () => { originalPage2Embedding[0].updated.getTime() ); }); - it("updates embedded content when page has not changed, but chunk algo has, ignoring since date", async () => { + it("should update embedded content when only chunk algo has changed", async () => { // change the chunking algo for the second page, but not the first await updateEmbeddedContent({ since: new Date(), @@ -480,7 +480,7 @@ describe("updateEmbeddedContent", () => { page2Embedding[0].chunkAlgoHash ); }); - it("use a new chunking algo on data sources, some of which have pages that have been updated", async () => { + it("should update embedded content when either chunk algo has changed or copy has changed", async () => { // SETUP: Modify dates of pages and embedded content for this test case const sinceDate = new Date("2024-01-01"); const afterSinceDate = new Date("2025-01-01"); From ffc47fae79c961607a0b62f45a036c724a25ff11 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Mon, 7 Apr 2025 16:15:41 -0400 Subject: [PATCH 19/19] rm unused chunk algo hash caching --- .../src/contentStore/updateEmbeddedContent.ts | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts index 9518817ec..4c794316c 100644 --- a/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts +++ b/packages/mongodb-rag-core/src/contentStore/updateEmbeddedContent.ts @@ -1,7 +1,7 @@ import { createHash } from "crypto"; import { PromisePool } from "@supercharge/promise-pool"; import { ChunkOptions, ChunkFunc, chunkPage } from "../chunk"; -import { EmbeddedContent, EmbeddedContentStore } from "./EmbeddedContent"; +import { EmbeddedContentStore } from "./EmbeddedContent"; import { Embedder } from "../embed"; import { logger } from "../logger"; import { PageStore, PersistedPage } from "."; @@ -19,19 +19,13 @@ export interface EmbedConcurrencyOptions { } const getHashForFunc = ( - chunkAlgoHashes: Map, f: ChunkFunc, o?: Partial ): string => { const data = JSON.stringify(o ?? {}) + f.toString(); - const existingHash = chunkAlgoHashes.get(data); - if (existingHash) { - return existingHash; - } const hash = createHash("sha256"); hash.update(data); const digest = hash.digest("hex"); - chunkAlgoHashes.set(data, digest); return digest; }; @@ -89,9 +83,7 @@ export const updateEmbeddedContent = async ({ }` ); // find all pages with embeddings created using chunkingHashes different from the current chunkingHash - const chunkAlgoHashes = new Map(); const chunkAlgoHash = getHashForFunc( - chunkAlgoHashes, chunkPage, chunkOptions );