Remove spacy model from toml (#1771)

* Remove spacy model from toml

* Semver
Author: Alonso Guevara
Date: 2025-02-26 10:58:02 -06:00 (committed by GitHub)
Parent: 716f93dd8b
Commit: b4b8b81c0a

6 changed files with 35 additions and 22 deletions


@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Remove spacy model from toml file"
+}


@@ -3,8 +3,13 @@
"""Base class for noun phrase extractors."""
import logging
from abc import ABCMeta, abstractmethod
import spacy
log = logging.getLogger(__name__)
class BaseNounPhraseExtractor(metaclass=ABCMeta):
"""Abstract base class for noun phrase extractors."""
@@ -37,3 +42,20 @@ class BaseNounPhraseExtractor(metaclass=ABCMeta):
    @abstractmethod
    def __str__(self) -> str:
        """Return string representation of the extractor, used for cache key generation."""
+
+    @staticmethod
+    def load_spacy_model(
+        model_name: str, exclude: list[str] | None = None
+    ) -> spacy.language.Language:
+        """Load a SpaCy model."""
+        if exclude is None:
+            exclude = []
+        try:
+            return spacy.load(model_name, exclude=exclude)
+        except OSError:
+            msg = f"Model `{model_name}` not found. Attempting to download..."
+            log.info(msg)
+            from spacy.cli.download import download
+
+            download(model_name)
+            return spacy.load(model_name, exclude=exclude)
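
With this helper in place, extractors no longer call spacy.load directly: on a machine where the model package is missing, the first call logs the message above and falls back to spaCy's own downloader. A minimal usage sketch, not part of the commit itself; en_core_web_md is simply the pipeline this repo previously pinned, and the staticmethod can be called on the abstract class:

from graphrag.index.operations.build_noun_graph.np_extractors.base import (
    BaseNounPhraseExtractor,
)

# First run on a clean environment downloads the package; later runs load it locally.
nlp = BaseNounPhraseExtractor.load_spacy_model("en_core_web_md", exclude=["lemmatizer"])
doc = nlp("GraphRAG builds a noun graph from indexed text units.")
print([chunk.text for chunk in doc.noun_chunks])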


@@ -5,7 +5,6 @@
from typing import Any

-import spacy
from spacy.tokens.doc import Doc

from graphrag.index.operations.build_noun_graph.np_extractors.base import (
@@ -57,9 +56,13 @@ class CFGNounPhraseExtractor(BaseNounPhraseExtractor):
        self.include_named_entities = include_named_entities
        self.exclude_entity_tags = exclude_entity_tags
        if not include_named_entities:
-            self.nlp = spacy.load(model_name, exclude=["lemmatizer", "parser", "ner"])
+            self.nlp = self.load_spacy_model(
+                model_name, exclude=["lemmatizer", "parser", "ner"]
+            )
        else:
-            self.nlp = spacy.load(model_name, exclude=["lemmatizer", "parser"])
+            self.nlp = self.load_spacy_model(
+                model_name, exclude=["lemmatizer", "parser"]
+            )
        self.exclude_pos_tags = exclude_pos_tags
        self.noun_phrase_grammars = noun_phrase_grammars


@@ -5,7 +5,6 @@
from typing import Any

-import spacy
from spacy.tokens.span import Span
from spacy.util import filter_spans
@@ -55,9 +54,9 @@ class SyntacticNounPhraseExtractor(BaseNounPhraseExtractor):
        self.include_named_entities = include_named_entities
        self.exclude_entity_tags = exclude_entity_tags
        if not include_named_entities:
-            self.nlp = spacy.load(model_name, exclude=["lemmatizer", "ner"])
+            self.nlp = self.load_spacy_model(model_name, exclude=["lemmatizer", "ner"])
        else:
-            self.nlp = spacy.load(model_name, exclude=["lemmatizer"])
+            self.nlp = self.load_spacy_model(model_name, exclude=["lemmatizer"])
        self.exclude_pos_tags = exclude_pos_tags

poetry.lock (generated)

@@ -1107,20 +1107,6 @@ files = [
{file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
]
[[package]]
name = "en_core_web_md"
version = "3.8.0"
description = "English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer."
optional = false
python-versions = "*"
files = [
{file = "en_core_web_md-3.8.0.tar.gz", hash = "sha256:504fe70715dca9464bf11e385fbe80e92ae127394df3f9c45242926082425551"},
]
[package.source]
type = "url"
url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0.tar.gz"
[[package]]
name = "environs"
version = "11.2.1"
@@ -5845,4 +5831,4 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.13"
content-hash = "10f2f9d7477992ddaf5d952fa5d0d0238a655a34d3638b4ce7914452d57a755b"
content-hash = "1258bf4de115ac572a4e01bb8eebb92d85300378c0c6b57aac3b468d49ce53e4"


@@ -47,7 +47,6 @@ format-jinja = """
[tool.poetry.dependencies]
python = ">=3.10,<3.13"
environs = "^11.0.0"
-en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0.tar.gz" }

# Vector Stores
azure-search-documents = "^11.5.2"
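
Because pyproject.toml no longer pins en_core_web_md, a fresh install of the package does not ship any spaCy pipeline; the helper above fetches it lazily the first time an extractor runs. For offline or containerized environments, one option is to pre-download the model so the first run never touches the network; this is spaCy's standard downloader, the same call the fallback uses, not something introduced by this commit:

# Optional pre-fetch, e.g. in a Dockerfile RUN step or a setup script.
from spacy.cli.download import download

download("en_core_web_md")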