Mirror of https://github.com/MaartenGr/KeyBERT.git (synced 2022-03-14 19:18:06 +03:00)

Commit: Init commit
1  .gitattributes  vendored  Normal file
@@ -0,0 +1 @@
*.ipynb linguist-documentation
77  .gitignore  vendored  Normal file
@@ -0,0 +1,77 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Sphinx documentation
docs/_build/


# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

.idea
.idea/
21  LICENSE  Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020, Maarten P. Grootendorst

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
19  Makefile  Normal file
@@ -0,0 +1,19 @@
test:
	pytest

install:
	python -m pip install -e .

install-test:
	python -m pip install -e ".[test]"
	python -m pip install -e ".[all]"

pypi:
	python setup.py sdist
	python setup.py bdist_wheel --universal
	twine upload dist/*

clean:
	rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache

check: test clean
6  README.md  Normal file
@@ -0,0 +1,6 @@
[](https://pypi.org/project/keybert/)
[](https://github.com/MaartenGr/keybert/blob/master/LICENSE)
[](https://pypi.org/project/keybert/)
[](https://pypi.org/project/keybert/)

# KeyBERT
1  docs/algorithm.md  Normal file
@@ -0,0 +1 @@
# The Algorithm
3  docs/api/keybert.md  Normal file
@@ -0,0 +1,3 @@
# `KeyBERT`

::: keybert.model.KeyBERT
BIN  docs/img/icon.png  Normal file
Binary file not shown (new file, 13 KiB).
1  docs/index.md  Normal file
@@ -0,0 +1 @@
# KeyBERT
0  docs/style.css  Normal file
BIN  images/icon.png  Normal file
Binary file not shown (new file, 15 KiB).
BIN  images/logo.png  Normal file
Binary file not shown (new file, 18 KiB).
0  keybert/__init__.py  Normal file
143  keybert/model.py  Normal file
@@ -0,0 +1,143 @@
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from typing import List, Union
import warnings


class KeyBERT:
    def __init__(self, model: str = 'distilbert-base-nli-mean-tokens'):
        self.model = SentenceTransformer(model)
        self.doc_embeddings = None

    def extract_keywords(self,
                         docs: Union[str, List[str]],
                         keyphrase_length: int = 1,
                         stop_words: Union[str, List[str]] = 'english',
                         top_n: int = 5,
                         min_df: int = 1) -> Union[List[str], List[List[str]]]:
        """ Extract keywords/keyphrases

        NOTE: I would advise you to use

        Single Document:


        Multiple Documents:
            There is an option to extract keywords for multiple documents
            that is faster than extraction for multiple single documents.

            However, this method assumes that you can keep the word embeddings
            for all words in the vocabulary in memory, which might be troublesome.

            I would advise against using this option and simply iterating
            over documents instead if you have limited hardware.

        Arguments:
            docs: The document(s) for which to extract keywords/keyphrases
            keyphrase_length: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases
            min_df: Minimum document frequency of a word across all documents
                    if keywords for multiple documents need to be extracted

        Returns:
            keywords: The top n keywords for a document

        """

        if isinstance(docs, str):
            return self._extract_keywords_single_doc(docs,
                                                     keyphrase_length,
                                                     stop_words,
                                                     top_n)
        elif isinstance(docs, list):
            warnings.warn("Although extracting keywords for multiple documents is faster "
                          "than iterating over single documents, it requires significant memory "
                          "to hold all word embeddings. Use this at your own discretion!")
            return self._extract_keywords_multiple_docs(docs,
                                                        keyphrase_length,
                                                        stop_words,
                                                        top_n,
                                                        min_df=min_df)

    def _extract_keywords_single_doc(self,
                                     doc: str,
                                     keyphrase_length: int = 1,
                                     stop_words: Union[str, List[str]] = 'english',
                                     top_n: int = 5) -> List[str]:
        """ Extract keywords/keyphrases for a single document

        Arguments:
            doc: The document for which to extract keywords/keyphrases
            keyphrase_length: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases

        Returns:
            keywords: The top n keywords for a document

        """
        try:
            # Extract Words
            n_gram_range = (keyphrase_length, keyphrase_length)
            count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
            words = count.get_feature_names()

            # Extract Embeddings
            doc_embeddings = self.model.encode([doc])
            word_embeddings = self.model.encode(words)

            # Calculate distances and extract keywords
            distances = cosine_similarity(doc_embeddings, word_embeddings)
            keywords = [words[index] for index in distances.argsort()[0][-top_n:]]

            return keywords[::-1]
        except ValueError:
            return []

    def _extract_keywords_multiple_docs(self,
                                        docs: List[str],
                                        keyphrase_length: int = 1,
                                        stop_words: str = 'english',
                                        top_n: int = 5,
                                        min_df: int = 1):
        """ Extract keywords/keyphrases for multiple documents

        Arguments:
            docs: The documents for which to extract keywords/keyphrases
            keyphrase_length: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the documents
            top_n: Return the top n keywords/keyphrases
            min_df: The minimum document frequency of words across all documents

        Returns:
            keywords: The top n keywords for each document

        """
        # Extract words
        n_gram_range = (keyphrase_length, keyphrase_length)
        count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words, min_df=min_df).fit(docs)
        words = count.get_feature_names()
        df = count.transform(docs)

        # Extract embeddings
        word_embeddings = self.model.encode(words, show_progress_bar=True)
        doc_embeddings = self.model.encode(docs, show_progress_bar=True)

        # Extract keywords
        keywords = []
        for index, doc in tqdm(enumerate(docs)):
            doc_words = [words[i] for i in df[index].nonzero()[1]]

            if doc_words:
                doc_word_embeddings = np.array([word_embeddings[i] for i in df[index].nonzero()[1]])
                distances = cosine_similarity([doc_embeddings[index]], doc_word_embeddings)[0]
                doc_keywords = [doc_words[i] for i in distances.argsort()[-top_n:]]
                keywords.append(doc_keywords)
            else:
                keywords.append(["None Found"])

        return keywords
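To make the API above concrete, here is a minimal usage sketch of the `KeyBERT` class added in this commit. It is not part of the commit itself: the example documents are invented for illustration, a working `sentence-transformers` install is assumed, and because `keybert/__init__.py` is empty at this point the class is imported from `keybert.model` directly.

```python
# Hypothetical usage sketch (not part of the commit) exercising KeyBERT.extract_keywords.
from keybert.model import KeyBERT

# Made-up example documents.
docs = [
    "Supervised learning is the machine learning task of learning a function "
    "that maps an input to an output based on example input-output pairs.",
    "BERT is a transformer-based model that produces contextual embeddings.",
]

kb = KeyBERT(model='distilbert-base-nli-mean-tokens')

# Single document: returns a flat list of the top_n keywords/keyphrases.
keywords = kb.extract_keywords(docs[0], keyphrase_length=1, stop_words='english', top_n=5)
print(keywords)

# Multiple documents: faster, but embeds the whole shared vocabulary at once,
# which is why the method emits a memory warning.
keywords_per_doc = kb.extract_keywords(docs, keyphrase_length=2, min_df=1, top_n=5)

# Memory-friendly alternative suggested by the docstring: iterate over documents.
keywords_per_doc_iter = [kb.extract_keywords(doc, top_n=5) for doc in docs]
```

The difference between the two paths is that the single-document branch only embeds that document's own candidate words, while the multi-document branch embeds the entire shared vocabulary once, which is the memory trade-off the docstring and the `warnings.warn` call describe.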
42  mkdocs.yml  Normal file
@@ -0,0 +1,42 @@
site_name: KeyBERT
extra_css: [style.css]
repo_url: https://github.com/MaartenGr/keyBERT
site_url: https://maartengr.github.io/keyBERT/
site_description: Leveraging BERT to extract important keywords
site_author: Maarten P. Grootendorst
use_directory_urls: false
nav:
  - Home:
    - Index: index.md
  - API:
    - KeyBERT: api/keybert.md
plugins:
  - mkdocstrings:
      watch:
        - keybert
  - search
copyright: Copyright © 2020 Maintained by <a href="https://github.com/MaartenGr">Maarten</a>.
theme:
  custom_dir: images/
  name: material
  icon:
    logo: material/library
  font:
    text: Ubuntu
    code: Ubuntu Mono
  favicon: icon.png
  logo: icon.png
  feature:
    tabs: true
  palette:
    primary: indigo
    accent: blue
markdown_extensions:
  - codehilite
  - pymdownx.inlinehilite
  - pymdownx.details
  - pymdownx.tabbed
  - pymdownx.highlight:
      use_pygments: true
  - toc:
      permalink: true
58  setup.py  Normal file
@@ -0,0 +1,58 @@
import setuptools

test_packages = [
    "pytest>=5.4.3",
    "pytest-cov>=2.6.1"
]

base_packages = [
    "sentence-transformers>=0.3.8",
    "scikit-learn>=0.22.2",
    "numpy>=1.18.5",
]

docs_packages = [
    "mkdocs==1.1",
    "mkdocs-material==4.6.3",
    "mkdocstrings==0.8.0",
]

dev_packages = docs_packages + test_packages

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="keybert",
    packages=["keybert"],
    version="0.0.1",
    author="Maarten Grootendorst",
    author_email="maartengrootendorst@gmail.com",
    description="KeyBERT performs keyword extraction with state-of-the-art transformer models.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/MaartenGr/keyBERT",
    keywords="nlp bert keyword extraction embeddings",
    classifiers=[
        "Programming Language :: Python",
        "Intended Audience :: Science/Research",
        "Intended Audience :: Developers",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "License :: OSI Approved :: MIT License",
        "Topic :: Scientific/Engineering",
        "Operating System :: Microsoft :: Windows",
        "Operating System :: POSIX",
        "Operating System :: Unix",
        "Operating System :: MacOS",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.8",
    ],
    install_requires=base_packages,
    extras_require={
        "test": test_packages,
        "docs": docs_packages,
        "dev": dev_packages,
    },
    python_requires='>=3.6',
)
0  tests/__init__.py  Normal file
15  tests/conftest.py  Normal file
@@ -0,0 +1,15 @@
# from bertopic import BERTopic
# import pytest
#
#
# @pytest.fixture(scope="module")
# def base_bertopic():
#     model = BERTopic(bert_model='distilbert-base-nli-mean-tokens',
#                      top_n_words=20,
#                      nr_topics=None,
#                      n_gram_range=(1, 1),
#                      min_topic_size=30,
#                      n_neighbors=15,
#                      n_components=5,
#                      verbose=False)
#     return model
20  tests/test_model.py  Normal file
@@ -0,0 +1,20 @@
# import pytest
# import numpy as np
# import pandas as pd
# from unittest import mock
#
# from sklearn.datasets import fetch_20newsgroups, make_blobs
# from keybert import KeyBERT
#
# newsgroup_docs = fetch_20newsgroups(subset='all')['data'][:1000]
#
# @mock.patch("bertopic.model.BERTopic._extract_embeddings")
# def test_fit_transform(embeddings, base_bertopic):
#     """ Test whether predictions are correctly made """
#     blobs, _ = make_blobs(n_samples=len(newsgroup_docs), centers=5, n_features=768, random_state=42)
#     embeddings.return_value = blobs
#     predictions = base_bertopic.fit_transform(newsgroup_docs)
#
#     assert isinstance(predictions, list)
#     assert len(predictions) == len(newsgroup_docs)
#     assert not set(predictions).difference(set(base_bertopic.get_topics().keys()))
0  theme/style.css  Normal file