mirror of
https://github.com/microsoft/graphrag.git
synced 2025-03-11 01:26:14 +03:00
Add backwards compatibility patch for vector store (#1334)
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"type": "patch",
|
||||
"description": "add backwards compatibility patch to vector store."
|
||||
}
|
||||
3
.vscode/extensions.json
vendored
3
.vscode/extensions.json
vendored
@@ -6,6 +6,7 @@
|
||||
"ms-python.vscode-pylance",
|
||||
"bierner.markdown-mermaid",
|
||||
"streetsidesoftware.code-spell-checker",
|
||||
"ronnidc.nunjucks"
|
||||
"ronnidc.nunjucks",
|
||||
"lucien-martijn.parquet-visualizer",
|
||||
]
|
||||
}
|
||||
|
||||
@@ -182,18 +182,56 @@ async def local_search(
|
||||
------
|
||||
TODO: Document any exceptions to expect.
|
||||
"""
|
||||
# TODO: must update filepath of lancedb (if used) until the new config engine has been implemented
|
||||
#################################### BEGIN PATCH ####################################
|
||||
# TODO: remove the following patch that checks for a vector_store prior to v1 release
|
||||
# TODO: this is a backwards compatibility patch that injects the default vector_store settings into the config if it is not present
|
||||
# Only applicable in situations involving a local vector_store (lancedb). The general idea:
|
||||
# if vector_store not in config:
|
||||
# 1. assume user is running local if vector_store is not in config
|
||||
# 2. insert default vector_store in config
|
||||
# 3. create lancedb vector_store instance
|
||||
# 4. upload vector embeddings from the input dataframes to the vector_store
|
||||
backwards_compatible = False
|
||||
if not config.embeddings.vector_store:
|
||||
backwards_compatible = True
|
||||
from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings
|
||||
from graphrag.vector_stores.lancedb import LanceDBVectorStore
|
||||
|
||||
config.embeddings.vector_store = {
|
||||
"type": "lancedb",
|
||||
"db_uri": f"{Path(config.storage.base_dir)}/lancedb",
|
||||
"collection_name": "entity_description_embeddings",
|
||||
"overwrite": True,
|
||||
}
|
||||
_entities = read_indexer_entities(nodes, entities, community_level)
|
||||
description_embedding_store = LanceDBVectorStore(
|
||||
db_uri=config.embeddings.vector_store["db_uri"],
|
||||
collection_name=config.embeddings.vector_store["collection_name"],
|
||||
overwrite=config.embeddings.vector_store["overwrite"],
|
||||
)
|
||||
description_embedding_store.connect(
|
||||
db_uri=config.embeddings.vector_store["db_uri"]
|
||||
)
|
||||
# dump embeddings from the entities list to the description_embedding_store
|
||||
store_entity_semantic_embeddings(
|
||||
entities=_entities, vectorstore=description_embedding_store
|
||||
)
|
||||
#################################### END PATCH ####################################
|
||||
|
||||
# TODO: update filepath of lancedb (if used) until the new config engine has been implemented
|
||||
# TODO: remove the type ignore annotations below once the new config engine has been refactored
|
||||
vector_store_type = config.embeddings.vector_store.get("type") # type: ignore
|
||||
vector_store_args = config.embeddings.vector_store
|
||||
if vector_store_type == "lancedb":
|
||||
if vector_store_type == VectorStoreType.LanceDB and not backwards_compatible:
|
||||
db_uri = config.embeddings.vector_store["db_uri"] # type: ignore
|
||||
lancedb_dir = Path(config.root_dir).resolve() / db_uri
|
||||
vector_store_args["db_uri"] = str(lancedb_dir) # type: ignore
|
||||
|
||||
reporter.info(f"Vector Store Args: {redact(vector_store_args)}") # type: ignore
|
||||
description_embedding_store = _get_embedding_description_store(
|
||||
config_args=vector_store_args, # type: ignore
|
||||
)
|
||||
if not backwards_compatible: # can remove this check and always set the description_embedding_store before v1 release
|
||||
description_embedding_store = _get_embedding_description_store(
|
||||
config_args=vector_store_args, # type: ignore
|
||||
)
|
||||
|
||||
_entities = read_indexer_entities(nodes, entities, community_level)
|
||||
_covariates = read_indexer_covariates(covariates) if covariates is not None else []
|
||||
@@ -205,7 +243,7 @@ async def local_search(
|
||||
entities=_entities,
|
||||
relationships=read_indexer_relationships(relationships),
|
||||
covariates={"claims": _covariates},
|
||||
description_embedding_store=description_embedding_store,
|
||||
description_embedding_store=description_embedding_store, # type: ignore
|
||||
response_type=response_type,
|
||||
)
|
||||
|
||||
@@ -251,18 +289,56 @@ async def local_search_streaming(
|
||||
------
|
||||
TODO: Document any exceptions to expect.
|
||||
"""
|
||||
#################################### BEGIN PATCH ####################################
|
||||
# TODO: remove the following patch that checks for a vector_store prior to v1 release
|
||||
# TODO: this is a backwards compatibility patch that injects the default vector_store settings into the config if it is not present
|
||||
# Only applicable in situations involving a local vector_store (lancedb). The general idea:
|
||||
# if vector_store not in config:
|
||||
# 1. assume user is running local if vector_store is not in config
|
||||
# 2. insert default vector_store in config
|
||||
# 3. create lancedb vector_store instance
|
||||
# 4. upload vector embeddings from the input dataframes to the vector_store
|
||||
backwards_compatible = False
|
||||
if not config.embeddings.vector_store:
|
||||
backwards_compatible = True
|
||||
from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings
|
||||
from graphrag.vector_stores.lancedb import LanceDBVectorStore
|
||||
|
||||
config.embeddings.vector_store = {
|
||||
"type": "lancedb",
|
||||
"db_uri": f"{Path(config.storage.base_dir)}/lancedb",
|
||||
"collection_name": "entity_description_embeddings",
|
||||
"overwrite": True,
|
||||
}
|
||||
_entities = read_indexer_entities(nodes, entities, community_level)
|
||||
description_embedding_store = LanceDBVectorStore(
|
||||
db_uri=config.embeddings.vector_store["db_uri"],
|
||||
collection_name=config.embeddings.vector_store["collection_name"],
|
||||
overwrite=config.embeddings.vector_store["overwrite"],
|
||||
)
|
||||
description_embedding_store.connect(
|
||||
db_uri=config.embeddings.vector_store["db_uri"]
|
||||
)
|
||||
# dump embeddings from the entities list to the description_embedding_store
|
||||
store_entity_semantic_embeddings(
|
||||
entities=_entities, vectorstore=description_embedding_store
|
||||
)
|
||||
#################################### END PATCH ####################################
|
||||
|
||||
# TODO: must update filepath of lancedb (if used) until the new config engine has been implemented
|
||||
# TODO: remove the type ignore annotations below once the new config engine has been refactored
|
||||
vector_store_type = config.embeddings.vector_store["type"] # type: ignore
|
||||
vector_store_type = config.embeddings.vector_store.get("type") # type: ignore
|
||||
vector_store_args = config.embeddings.vector_store
|
||||
if vector_store_type == VectorStoreType.LanceDB:
|
||||
if vector_store_type == VectorStoreType.LanceDB and not backwards_compatible:
|
||||
db_uri = config.embeddings.vector_store["db_uri"] # type: ignore
|
||||
lancedb_dir = Path(config.root_dir).resolve() / db_uri
|
||||
vector_store_args["db_uri"] = str(lancedb_dir) # type: ignore
|
||||
|
||||
reporter.info(f"Vector Store Args: {redact(vector_store_args)}") # type: ignore
|
||||
description_embedding_store = _get_embedding_description_store(
|
||||
config_args=vector_store_args, # type: ignore
|
||||
)
|
||||
if not backwards_compatible: # can remove this check and always set the description_embedding_store before v1 release
|
||||
description_embedding_store = _get_embedding_description_store(
|
||||
config_args=vector_store_args, # type: ignore
|
||||
)
|
||||
|
||||
_entities = read_indexer_entities(nodes, entities, community_level)
|
||||
_covariates = read_indexer_covariates(covariates) if covariates is not None else []
|
||||
@@ -274,7 +350,7 @@ async def local_search_streaming(
|
||||
entities=_entities,
|
||||
relationships=read_indexer_relationships(relationships),
|
||||
covariates={"claims": _covariates},
|
||||
description_embedding_store=description_embedding_store,
|
||||
description_embedding_store=description_embedding_store, # type: ignore
|
||||
response_type=response_type,
|
||||
)
|
||||
search_result = search_engine.astream_search(query=query)
|
||||
|
||||
4
tests/fixtures/min-csv/settings.yml
vendored
4
tests/fixtures/min-csv/settings.yml
vendored
@@ -6,12 +6,11 @@ embeddings:
|
||||
type: "lancedb"
|
||||
db_uri: "./tests/fixtures/min-csv/lancedb"
|
||||
collection_name: "lancedb_ci"
|
||||
overwrite: True
|
||||
store_in_table: True
|
||||
|
||||
entity_name_description:
|
||||
title_column: "name"
|
||||
# id_column: "id"
|
||||
# overwrite: true
|
||||
# entity_name: ...
|
||||
# relationship_description: ...
|
||||
# community_report_full_content: ...
|
||||
@@ -20,7 +19,6 @@ embeddings:
|
||||
# document_raw_content: ...
|
||||
# text_unit_text: ...
|
||||
|
||||
|
||||
storage:
|
||||
type: file # or blob
|
||||
base_dir: "output/${timestamp}/artifacts"
|
||||
|
||||
Reference in New Issue
Block a user