Add backwards compatibility patch for vector store (#1334)

This commit is contained in:
Josh Bradley
2024-10-29 14:54:08 -04:00
committed by GitHub
parent 83026bdb26
commit 0cc79b9cf7
4 changed files with 95 additions and 16 deletions

View File

@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "add backwards compatibility patch to vector store."
}

View File

@@ -6,6 +6,7 @@
"ms-python.vscode-pylance",
"bierner.markdown-mermaid",
"streetsidesoftware.code-spell-checker",
"ronnidc.nunjucks"
"ronnidc.nunjucks",
"lucien-martijn.parquet-visualizer",
]
}

View File

@@ -182,18 +182,56 @@ async def local_search(
------
TODO: Document any exceptions to expect.
"""
# TODO: must update filepath of lancedb (if used) until the new config engine has been implemented
#################################### BEGIN PATCH ####################################
# TODO: remove the following patch that checks for a vector_store prior to v1 release
# TODO: this is a backwards compatibility patch that injects the default vector_store settings into the config if it is not present
# Only applicable in situations involving a local vector_store (lancedb). The general idea:
# if vector_store not in config:
# 1. assume user is running local if vector_store is not in config
# 2. insert default vector_store in config
# 3. create lancedb vector_store instance
# 4. upload vector embeddings from the input dataframes to the vector_store
backwards_compatible = False
if not config.embeddings.vector_store:
backwards_compatible = True
from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings
from graphrag.vector_stores.lancedb import LanceDBVectorStore
config.embeddings.vector_store = {
"type": "lancedb",
"db_uri": f"{Path(config.storage.base_dir)}/lancedb",
"collection_name": "entity_description_embeddings",
"overwrite": True,
}
_entities = read_indexer_entities(nodes, entities, community_level)
description_embedding_store = LanceDBVectorStore(
db_uri=config.embeddings.vector_store["db_uri"],
collection_name=config.embeddings.vector_store["collection_name"],
overwrite=config.embeddings.vector_store["overwrite"],
)
description_embedding_store.connect(
db_uri=config.embeddings.vector_store["db_uri"]
)
# dump embeddings from the entities list to the description_embedding_store
store_entity_semantic_embeddings(
entities=_entities, vectorstore=description_embedding_store
)
#################################### END PATCH ####################################
# TODO: update filepath of lancedb (if used) until the new config engine has been implemented
# TODO: remove the type ignore annotations below once the new config engine has been refactored
vector_store_type = config.embeddings.vector_store.get("type") # type: ignore
vector_store_args = config.embeddings.vector_store
if vector_store_type == "lancedb":
if vector_store_type == VectorStoreType.LanceDB and not backwards_compatible:
db_uri = config.embeddings.vector_store["db_uri"] # type: ignore
lancedb_dir = Path(config.root_dir).resolve() / db_uri
vector_store_args["db_uri"] = str(lancedb_dir) # type: ignore
reporter.info(f"Vector Store Args: {redact(vector_store_args)}") # type: ignore
description_embedding_store = _get_embedding_description_store(
config_args=vector_store_args, # type: ignore
)
if not backwards_compatible: # can remove this check and always set the description_embedding_store before v1 release
description_embedding_store = _get_embedding_description_store(
config_args=vector_store_args, # type: ignore
)
_entities = read_indexer_entities(nodes, entities, community_level)
_covariates = read_indexer_covariates(covariates) if covariates is not None else []
@@ -205,7 +243,7 @@ async def local_search(
entities=_entities,
relationships=read_indexer_relationships(relationships),
covariates={"claims": _covariates},
description_embedding_store=description_embedding_store,
description_embedding_store=description_embedding_store, # type: ignore
response_type=response_type,
)
@@ -251,18 +289,56 @@ async def local_search_streaming(
------
TODO: Document any exceptions to expect.
"""
#################################### BEGIN PATCH ####################################
# TODO: remove the following patch that checks for a vector_store prior to v1 release
# TODO: this is a backwards compatibility patch that injects the default vector_store settings into the config if it is not present
# Only applicable in situations involving a local vector_store (lancedb). The general idea:
# if vector_store not in config:
# 1. assume user is running local if vector_store is not in config
# 2. insert default vector_store in config
# 3. create lancedb vector_store instance
# 4. upload vector embeddings from the input dataframes to the vector_store
backwards_compatible = False
if not config.embeddings.vector_store:
backwards_compatible = True
from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings
from graphrag.vector_stores.lancedb import LanceDBVectorStore
config.embeddings.vector_store = {
"type": "lancedb",
"db_uri": f"{Path(config.storage.base_dir)}/lancedb",
"collection_name": "entity_description_embeddings",
"overwrite": True,
}
_entities = read_indexer_entities(nodes, entities, community_level)
description_embedding_store = LanceDBVectorStore(
db_uri=config.embeddings.vector_store["db_uri"],
collection_name=config.embeddings.vector_store["collection_name"],
overwrite=config.embeddings.vector_store["overwrite"],
)
description_embedding_store.connect(
db_uri=config.embeddings.vector_store["db_uri"]
)
# dump embeddings from the entities list to the description_embedding_store
store_entity_semantic_embeddings(
entities=_entities, vectorstore=description_embedding_store
)
#################################### END PATCH ####################################
# TODO: must update filepath of lancedb (if used) until the new config engine has been implemented
# TODO: remove the type ignore annotations below once the new config engine has been refactored
vector_store_type = config.embeddings.vector_store["type"] # type: ignore
vector_store_type = config.embeddings.vector_store.get("type") # type: ignore
vector_store_args = config.embeddings.vector_store
if vector_store_type == VectorStoreType.LanceDB:
if vector_store_type == VectorStoreType.LanceDB and not backwards_compatible:
db_uri = config.embeddings.vector_store["db_uri"] # type: ignore
lancedb_dir = Path(config.root_dir).resolve() / db_uri
vector_store_args["db_uri"] = str(lancedb_dir) # type: ignore
reporter.info(f"Vector Store Args: {redact(vector_store_args)}") # type: ignore
description_embedding_store = _get_embedding_description_store(
config_args=vector_store_args, # type: ignore
)
if not backwards_compatible: # can remove this check and always set the description_embedding_store before v1 release
description_embedding_store = _get_embedding_description_store(
config_args=vector_store_args, # type: ignore
)
_entities = read_indexer_entities(nodes, entities, community_level)
_covariates = read_indexer_covariates(covariates) if covariates is not None else []
@@ -274,7 +350,7 @@ async def local_search_streaming(
entities=_entities,
relationships=read_indexer_relationships(relationships),
covariates={"claims": _covariates},
description_embedding_store=description_embedding_store,
description_embedding_store=description_embedding_store, # type: ignore
response_type=response_type,
)
search_result = search_engine.astream_search(query=query)

View File

@@ -6,12 +6,11 @@ embeddings:
type: "lancedb"
db_uri: "./tests/fixtures/min-csv/lancedb"
collection_name: "lancedb_ci"
overwrite: True
store_in_table: True
entity_name_description:
title_column: "name"
# id_column: "id"
# overwrite: true
# entity_name: ...
# relationship_description: ...
# community_report_full_content: ...
@@ -20,7 +19,6 @@ embeddings:
# document_raw_content: ...
# text_unit_text: ...
storage:
type: file # or blob
base_dir: "output/${timestamp}/artifacts"