mirror of
https://github.com/microsoft/graphrag.git
synced 2025-03-11 01:26:14 +03:00
Remove spacy model from toml (#1771)
* Remove spacy model from toml * Semver
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"type": "patch",
|
||||
"description": "Remove spacy model from toml file"
|
||||
}
|
||||
@@ -3,8 +3,13 @@
|
||||
|
||||
"""Base class for noun phrase extractors."""
|
||||
|
||||
import logging
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
import spacy
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseNounPhraseExtractor(metaclass=ABCMeta):
|
||||
"""Abstract base class for noun phrase extractors."""
|
||||
@@ -37,3 +42,20 @@ class BaseNounPhraseExtractor(metaclass=ABCMeta):
|
||||
@abstractmethod
|
||||
def __str__(self) -> str:
|
||||
"""Return string representation of the extractor, used for cache key generation."""
|
||||
|
||||
@staticmethod
def load_spacy_model(
    model_name: str, exclude: list[str] | None = None
) -> spacy.language.Language:
    """Load a SpaCy model, downloading it when the first load fails.

    Args:
        model_name: Name of the SpaCy model to load.
        exclude: Pipeline component names forwarded to ``spacy.load`` as
            ``exclude``. ``None`` is treated as an empty list.

    Returns:
        The loaded SpaCy ``Language`` pipeline.

    Raises:
        OSError: Propagated from the second ``spacy.load`` call if the
            model still cannot be loaded after the download attempt.
    """
    components_to_skip = [] if exclude is None else exclude

    try:
        pipeline = spacy.load(model_name, exclude=components_to_skip)
    except OSError:
        # The initial load raised OSError, which this code treats as
        # "model not installed": log, fetch the model, then retry once.
        notice = f"Model `{model_name}` not found. Attempting to download..."
        log.info(notice)

        # Imported lazily so the download CLI is only pulled in on this path.
        from spacy.cli.download import download

        download(model_name)
        pipeline = spacy.load(model_name, exclude=components_to_skip)

    return pipeline
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
|
||||
from typing import Any
|
||||
|
||||
import spacy
|
||||
from spacy.tokens.doc import Doc
|
||||
|
||||
from graphrag.index.operations.build_noun_graph.np_extractors.base import (
|
||||
@@ -57,9 +56,13 @@ class CFGNounPhraseExtractor(BaseNounPhraseExtractor):
|
||||
self.include_named_entities = include_named_entities
|
||||
self.exclude_entity_tags = exclude_entity_tags
|
||||
if not include_named_entities:
|
||||
self.nlp = spacy.load(model_name, exclude=["lemmatizer", "parser", "ner"])
|
||||
self.nlp = self.load_spacy_model(
|
||||
model_name, exclude=["lemmatizer", "parser", "ner"]
|
||||
)
|
||||
else:
|
||||
self.nlp = spacy.load(model_name, exclude=["lemmatizer", "parser"])
|
||||
self.nlp = self.load_spacy_model(
|
||||
model_name, exclude=["lemmatizer", "parser"]
|
||||
)
|
||||
|
||||
self.exclude_pos_tags = exclude_pos_tags
|
||||
self.noun_phrase_grammars = noun_phrase_grammars
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
|
||||
from typing import Any
|
||||
|
||||
import spacy
|
||||
from spacy.tokens.span import Span
|
||||
from spacy.util import filter_spans
|
||||
|
||||
@@ -55,9 +54,9 @@ class SyntacticNounPhraseExtractor(BaseNounPhraseExtractor):
|
||||
self.include_named_entities = include_named_entities
|
||||
self.exclude_entity_tags = exclude_entity_tags
|
||||
if not include_named_entities:
|
||||
self.nlp = spacy.load(model_name, exclude=["lemmatizer", "ner"])
|
||||
self.nlp = self.load_spacy_model(model_name, exclude=["lemmatizer", "ner"])
|
||||
else:
|
||||
self.nlp = spacy.load(model_name, exclude=["lemmatizer"])
|
||||
self.nlp = self.load_spacy_model(model_name, exclude=["lemmatizer"])
|
||||
|
||||
self.exclude_pos_tags = exclude_pos_tags
|
||||
|
||||
|
||||
16
poetry.lock
generated
16
poetry.lock
generated
@@ -1107,20 +1107,6 @@ files = [
|
||||
{file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "en_core_web_md"
|
||||
version = "3.8.0"
|
||||
description = "English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "en_core_web_md-3.8.0.tar.gz", hash = "sha256:504fe70715dca9464bf11e385fbe80e92ae127394df3f9c45242926082425551"},
|
||||
]
|
||||
|
||||
[package.source]
|
||||
type = "url"
|
||||
url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0.tar.gz"
|
||||
|
||||
[[package]]
|
||||
name = "environs"
|
||||
version = "11.2.1"
|
||||
@@ -5845,4 +5831,4 @@ files = [
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.10,<3.13"
|
||||
content-hash = "10f2f9d7477992ddaf5d952fa5d0d0238a655a34d3638b4ce7914452d57a755b"
|
||||
content-hash = "1258bf4de115ac572a4e01bb8eebb92d85300378c0c6b57aac3b468d49ce53e4"
|
||||
|
||||
@@ -47,7 +47,6 @@ format-jinja = """
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.10,<3.13"
|
||||
environs = "^11.0.0"
|
||||
en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0.tar.gz" }
|
||||
|
||||
# Vector Stores
|
||||
azure-search-documents = "^11.5.2"
|
||||
|
||||
Reference in New Issue
Block a user